

/************************************************************************************************/
/************************************************************************************************/
********** The Following Code Generates the Results for ;
********** "Unsmoothing Returns of Illiquid Funds";
********** Authors: Spencer J Couts, Andrei S Gonçalves, Andrea Rossi;
********** If you use this code, please cite Couts, Gonçalves, and Rossi (2024);


**Main Notes: The Code is written in SAS;
**Main Notes: This code includes the AR-based unsmoothing technique (used for Real Estate funds) and the MA-based unsmoothing technique (used for Hedge Funds);
**Main Notes: The fund-by-fund unsmoothing techniques (which we refer to as 1-step unsmoothing) were first introduced in the academic literature by Geltner (1991,1993) and Getmansky, Lo, and Makarov (2004);
**Main Notes: Please contact the authors for help with the code or data sharing (NB: the original data belongs to the data providers listed in the paper - hence the data can be shared only with parties that have acquired the rights to the data);

** CODE 1: AR UNSMOOTHING/REAL ESTATE;
****Produces: Figure 1 (RE part), Table 6, Table 7;

** CODE 2: MA UNSMOOTHING MAIN/HEDGE FUNDS;
****Produces: Figure 1 (HF part), Figure 2, 4, 5, 6, Table 1, 2, 3;
****Appendix for Code 2: Simulation to Verify Convergence/Stability/Bias Of MA Estimation Procedure;

** CODE 3: ANALYSIS OF BETA-SORTED HEDGE FUND PORTFOLIO RETURNS;
****Produces: Table 4;

** CODE 4: PREDICTABILITY OF FUND ALPHAS;
****Produces: Figure 3;

** CODE 5: SIMULATION: MA(1) WITH 1-FACTOR MODEL;
****Produces: Table 5, Panel A;

** CODE 6: SIMULATION: MA(3) WITH 8-FACTOR MODEL;
****Produces: Table 5, Panel B;




*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 1: AR UNSMOOTHING/REAL ESTATE;
****Produces: Figure 1 (RE part), Table 6, Table 7;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;


************ Unsmooth RE Fund Returns using AR(2) process;
*** This part of the code produces results reported in: ;
*Figure 1 (Real Estate Part of the Figure), Table 6, Table 7;


*Description of key Real Estate Data variables;
*fundid_mer = fund ID;
*ret = reported return for quarter t;
*yyyyq = date variable;
*net_aum3 = AUM of fund as of end of period t;
*categ = type of fund;

*read the fund panel, discard rows with a blank fund type, and move to log returns;
*rep_ret keeps the return as reported, ret becomes log(1+ret) because the unsmoothing works on log-returns;
data re00;
    set reautoc.re_fund_data_08a;
    if categ = ' ' then delete;
    rep_ret = ret;
    ret = log(1+ret);
    drop net_aum1 net_aum2;
run;

proc sort data = re00;
    by fundid_mer yyyyq;
run;

*previous-row AUM, blanked on the first row of every fund, then the main sample window;
*the window starts 2 quarters before 1994 because the AR(2) needs two lags;
*both lag calls sit above the subsetting IF so they see every row of the sorted panel;
data re00;
    set re00;
    lag_net_aum3 = lag(net_aum3);
    if fundid_mer ne lag(fundid_mer) then lag_net_aum3 = .;
    if 19933 le yyyyq le 20174;
run;

*number the distinct quarters of the sample 1, 2, 3, ... (yyyyq_seq);
proc sort data = re00(keep = yyyyq) out = w_q_seq nodupkey;
    by yyyyq;
run;

data w_q_seq;
    set w_q_seq;
    yyyyq_seq + 1;
run;

*attach the quarter index to every fund-quarter row;
proc sort data = re00;
    by yyyyq;
run;

data re00;
    merge re00 w_q_seq;
    by yyyyq;
run;

*number of rows per fund (taken from _FREQ_) and the fund-level std of ret;
proc sort data = re00;
    by fundid_mer;
run;

proc summary data = re00;
    by fundid_mer;
    var ret;
    output out = w_mean_ret_1 std = std_ret_fundid;
run;

*keep only funds with at least 36 rows;
data re00;
    merge re00 w_mean_ret_1;
    by fundid_mer;
    fundid_mer_obs = _FREQ_;
    if fundid_mer_obs ge 36;
    drop _FREQ_ _TYPE_;
run;

*fund_seq = position of the row inside the fund history (1 = first row);
proc sort data = re00;
    by fundid_mer yyyyq;
run;

data re00;
    set re00;
    by fundid_mer;
    if first.fundid_mer then fund_seq = 0;
    fund_seq + 1;
run;

*check for holes in each fund time series of data;
*check_qq = number of quarters skipped between a row and the previous row of the same fund;
*the first row of a fund has no predecessor, so its check_qq is forced to 0;
proc sort data = re00; by fundid_mer yyyyq; run;
data re00;
    set re00;
    check_qq = yyyyq_seq - lag(yyyyq_seq) - 1;
    if fund_seq = 1 then check_qq = 0;
run;

*rows that come right after a hole: remember the fund and the position where the hole was found;
data miss_obs;
    set re00;
    if check_qq ge 1;
    miss_fund_seq = fund_seq;
    fund_miss = 1;
    keep fundid_mer fund_miss miss_fund_seq;
run;

*keep only the EARLIEST hole of each fund;
*without this, a fund with several holes would contribute several rows to the match-merge below,;
*and the merge would pair them row by row, leaving most rows with the position of the LAST hole;
proc sort data = miss_obs; by fundid_mer miss_fund_seq; run;
data miss_obs;
    set miss_obs;
    by fundid_mer;
    if first.fundid_mer;
run;

*eliminate every observation from the first hole onwards (funds without holes are untouched);
*fund_miss and miss_fund_seq stay on the dataset, as before;
proc sort data = re00; by fundid_mer yyyyq; run;
data re00;
    merge re00 miss_obs;
    by fundid_mer;
    if fund_miss = 1 and fund_seq ge miss_fund_seq then delete;
run;

*cross-sectional summary per quarter: _FREQ_ holds the number of rows, plus mean return and total AUM;
*del1 and del2 are throw-away outputs (mean of AUM, sum of returns);
proc sort data = re00;
    by yyyyq;
run;

proc summary data = re00;
    by yyyyq;
    var ret net_aum3;
    output out = w_mm_n
        mean = av_ret del1
        sum  = del2 sum_net_aum3;
run;

*quarter of the first row of each fund;
data first_qq_by_fund;
    set re00;
    where fund_seq = 1;
    first_yyyyqq = yyyyq;
    keep fundid_mer first_yyyyqq;
run;

proc sort data = re00;
    by fundid_mer;
run;

proc sort data = first_qq_by_fund;
    by fundid_mer;
run;

*attach the first quarter and discard the helper columns of the hole check;
data re00;
    merge re00 first_qq_by_fund;
    by fundid_mer;
    drop fund_miss miss_fund_seq;
run;


*Sample statistics;
*step 1: one row per fund (within fund-type flags) carrying its fundid_mer_obs;
proc sort data = re00;
    by open_end_all open_end_odce closed_end fundid_mer;
run;

proc summary data = re00;
    by open_end_all open_end_odce closed_end fundid_mer;
    var fundid_mer_obs;
    output out = sample_stats_by_fund mean = fundid_mer_obs;
run;

*step 2: average fundid_mer_obs per fund-type combination (_FREQ_ = number of funds);
proc summary data = sample_stats_by_fund;
    by open_end_all open_end_odce closed_end;
    var fundid_mer_obs;
    output out = sample_stats mean = /autoname;
run;

*step 3: same statistic over all funds together;
proc summary data = sample_stats_by_fund;
    var fundid_mer_obs;
    output out = sample_stats_all mean = /autoname;
run;


/******************************************************************************************************************/
**** Part 1: simple AR(2) unsmoothing method;

*time-series mean of the log return of each fund;
proc sort data = re00;
    by fundid_mer;
run;

proc summary data = re00;
    by fundid_mer;
    var ret;
    output out = w_av_fundret mean = av_ret_fundid;
run;

*de-mean fund return;
data re00;
    merge re00 w_av_fundret;
    by fundid_mer;
    dem_ret = ret - av_ret_fundid;
    drop _TYPE_ _FREQ_;
run;

*regressors of the AR(2): first and second lag of the de-meaned return;
*lags that would reach back into the previous fund are set to missing;
proc sort data = re00;
    by fundid_mer yyyyq;
run;

data re00;
    set re00;
    lag1_dem_ret = lag1(dem_ret);
    lag2_dem_ret = lag2(dem_ret);
    if fund_seq = 1 then lag1_dem_ret = .;
    if fund_seq le 2 then lag2_dem_ret = .;
run;

proc sort data = re00;
    by fundid_mer yyyyq;
run;

*fund-by-fund OLS of the de-meaned return on its two lags;
*outest holds the estimates, re01 holds the input rows plus the residual;
proc reg data = re00 tableout outest = v_AR2_fund_est noprint;
    by fundid_mer;
    model dem_ret = lag1_dem_ret lag2_dem_ret;
    output out = re01 r = fund_ar2_resid;
run; quit;

*keep the coefficient row only and give the two slopes their own names;
data AR2_fund_est_coeff;
    set v_AR2_fund_est;
    where _TYPE_ = 'PARMS';
    phi1_fund = lag1_dem_ret;
    phi2_fund = lag2_dem_ret;
    keep fundid_mer phi1_fund phi2_fund;
run;

*attach the coefficients and compute the 1-step unsmoothed return;
*unsm_ret = fund mean + residual scaled by 1/(1 - phi1 - phi2);
*the first two rows of a fund have no AR(2) residual, so the reported return is used there;
data re01;
    merge re01 AR2_fund_est_coeff;
    by fundid_mer;
    label fund_ar2_resid = ' ';
    unsm_ret = av_ret_fundid + fund_ar2_resid/(1-phi1_fund-phi2_fund);
    if fund_seq in (1, 2) then unsm_ret = ret;
run;

*equal-weighted cross-sectional average of reported and 1-step unsmoothed returns, per quarter;
proc sort data = re01;
    by yyyyq;
run;

proc summary data = re01;
    by yyyyq;
    var ret unsm_ret;
    output out = re01_aggr_ew mean = ret_ew unsm_ret_ew;
run;

proc sort data = re01_aggr_ew;
    by yyyyq;
run;

*n_funds = number of rows averaged in the quarter, plus lags 1 to 4 of both aggregate series;
data re01_aggr_ew;
    set re01_aggr_ew;
    n_funds = _FREQ_;

    *reported;
    lag1_ret_ew = lag1(ret_ew);
    lag2_ret_ew = lag2(ret_ew);
    lag3_ret_ew = lag3(ret_ew);
    lag4_ret_ew = lag4(ret_ew);

    *unsmoothed;
    lag1_unsm_ret_ew = lag1(unsm_ret_ew);
    lag2_unsm_ret_ew = lag2(unsm_ret_ew);
    lag3_unsm_ret_ew = lag3(unsm_ret_ew);
    lag4_unsm_ret_ew = lag4(unsm_ret_ew);

    drop _TYPE_ _FREQ_;
run;


/****************************************************************************************************/
/***************************** Implement 3-step unsmoothing ************************************/

*de-mean the aggregate EW return over the whole sample;
*dum is a constant key used only to attach the single-row mean to every quarter;
data re01_aggr_ew;
    set re01_aggr_ew;
    dum = 1;
run;

proc summary data = re01_aggr_ew;
    var ret_ew;
    output out = w_mean_ret_ew mean = av_ret_ew;
run;

data w_mean_ret_ew;
    set w_mean_ret_ew;
    dum = 1;
    drop _TYPE_ _FREQ_;
run;

data re01_aggr_ew;
    merge re01_aggr_ew w_mean_ret_ew;
    by dum;
    dem_ret_ew = ret_ew - av_ret_ew;
run;

*first and second lag of the de-meaned aggregate return, in calendar order;
proc sort data = re01_aggr_ew;
    by yyyyq;
run;

data v_re01_aggr_ew_lagged_clean;
    set re01_aggr_ew;
    lag1_dem_ret_ew = lag1(dem_ret_ew);
    lag2_dem_ret_ew = lag2(dem_ret_ew);
    keep yyyyq dem_ret_ew lag1_dem_ret_ew lag2_dem_ret_ew ret_ew;
run;


*prepare the fund-level panel for the regression on lagged excess and lagged aggregate returns;
*NOTE(review): the lagN_ret and lagN_unsm_ret names in the KEEP list are not created in the code above,;
*they are kept only if they already exist on the input data - TODO confirm;
data re01_clean;
    set re01;
    keep fundid_mer ret dem_ret yyyyq fund_seq unsm_ret av_ret_fundid
         net_aum3 lag_net_aum3 lag1_dem_ret lag2_dem_ret
         lag1_ret lag2_ret lag3_ret lag4_ret
         lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret;
run;

*attach aggregate EW return (and its lags) to every fund-quarter;
proc sort data = re01_clean;
    by yyyyq;
run;

proc sort data = v_re01_aggr_ew_lagged_clean;
    by yyyyq;
run;

*ret_ex_ew = fund return in excess of the aggregate EW return;
data re01_clean;
    merge re01_clean v_re01_aggr_ew_lagged_clean;
    by yyyyq;
    ret_ex_ew = ret - ret_ew;
run;

*demean each fund excess return and the lagged aggregate returns at fund level;
proc sort data = re01_clean; by fundid_mer; run;
proc summary data = re01_clean;
    var ret_ex_ew lag1_dem_ret_ew lag2_dem_ret_ew;
    output out = w_mean_ret_ex_ew_fundid
        mean = av_ret_ex_ew_fundid av_lag1_dem_ret_ew_fundid av_lag2_dem_ret_ew_fundid;
    by fundid_mer;
run;

data re01_clean;
    merge re01_clean w_mean_ret_ex_ew_fundid;
    by fundid_mer;
    dem_ret_ex_ew = ret_ex_ew - av_ret_ex_ew_fundid;
    dem_lag1_dem_ret_ew = lag1_dem_ret_ew - av_lag1_dem_ret_ew_fundid;
    dem_lag2_dem_ret_ew = lag2_dem_ret_ew - av_lag2_dem_ret_ew_fundid;
    drop _TYPE_ _FREQ_;
run;

*lag the regressors;
*the lag functions run over the stacked panel, so the first rows of a fund would otherwise;
*receive values that belong to the previous fund - those cells are set to missing;
proc sort data = re01_clean; by fundid_mer yyyyq; run;

data re01_clean;
    set re01_clean;
    *1 lag;
    lag1_dem_ret_ex_ew = lag1(dem_ret_ex_ew);
    if fund_seq = 1 then lag1_dem_ret_ex_ew = . ;
    if fund_seq = 1 then dem_lag1_dem_ret_ew = . ;
    *2 lags;
    lag2_dem_ret_ex_ew = lag2(dem_ret_ex_ew);
    *fix: the second lag of the excess return was previously left un-reset at fund boundaries;
    *the affected rows already have a missing dem_lag2_dem_ret_ew, so the rows used by a;
    *regression on all four regressors are unchanged;
    if fund_seq le 2 then lag2_dem_ret_ex_ew = . ;
    if fund_seq le 2 then dem_lag2_dem_ret_ew = . ;
run;

proc sort data = re01_clean;
    by fundid_mer yyyyq;
run;

*STEP 1 OLS, fund by fund: de-meaned return on two lags of the fund excess return;
*and two lags of the aggregate EW return;
proc reg data = re01_clean tableout outest = v_AR2_ex_ew_est noprint;
    by fundid_mer;
    model dem_ret = lag1_dem_ret_ex_ew lag2_dem_ret_ex_ew dem_lag1_dem_ret_ew dem_lag2_dem_ret_ew;
    output out = re03 r = ex_ew_ar2_resid;
run; quit;

*coefficient row only: phi = loadings on own excess-return lags, pi = loadings on aggregate lags;
data AR2_ex_ew_est_coeff;
    set v_AR2_ex_ew_est;
    where _TYPE_ = 'PARMS';
    phi1_fund_ex_ew  = lag1_dem_ret_ex_ew;
    pi1_fund_aggr_ew = dem_lag1_dem_ret_ew;
    phi2_fund_ex_ew  = lag2_dem_ret_ex_ew;
    pi2_fund_aggr_ew = dem_lag2_dem_ret_ew;
    keep fundid_mer phi1_fund_ex_ew pi1_fund_aggr_ew phi2_fund_ex_ew pi2_fund_aggr_ew;
run;

*attach coefficients to the residual panel;
*the _au copies of pi are blanked on the first two rows of each fund so that those rows;
*do not enter the cross-sectional averages taken in step 2;
data re03;
    merge re03 AR2_ex_ew_est_coeff;
    by fundid_mer;
    label ex_ew_ar2_resid = ' ';
    pi1_fund_aggr_ew_au = pi1_fund_aggr_ew;
    pi2_fund_aggr_ew_au = pi2_fund_aggr_ew;
    if fund_seq le 2 then do;
        pi1_fund_aggr_ew_au = .;
        pi2_fund_aggr_ew_au = .;
    end;
run;

******************* STEP 2: Obtain eta_hat_t;
*aggregation step: per quarter, average the step-1 residual and the two pi coefficients across funds;
proc sort data = re03;
    by yyyyq;
run;

proc summary data = re03;
    by yyyyq;
    var ex_ew_ar2_resid pi1_fund_aggr_ew_au pi2_fund_aggr_ew_au;
    output out = re03_step2 mean = /autoname;
run;

*eta_hat_t = average residual divided by (1 - average pi1 - average pi2);
data re03_step2;
    set re03_step2;
    eta_hat_t = ex_ew_ar2_resid_mean/(1 -pi1_fund_aggr_ew_au_mean -pi2_fund_aggr_ew_au_mean);
    drop _TYPE_ _FREQ_;
run;

****************** STEP 3: Use eta_hat_t to recover 3-step unsmoothed returns;
*bring eta_hat_t back to the fund-quarter panel (re03 is in yyyyq order at this point);
data re03_step3;
    merge re03 re03_step2;
    by yyyyq;
run;

*mean of eta_hat_t over the quarters in which each fund is present;
proc sort data = re03_step3;
    by fundid_mer;
run;

proc summary data = re03_step3;
    by fundid_mer;
    var eta_hat_t;
    output out = w_mean_eta_hat_fund mean = eta_hat_t_mean_fund;
run;

*eta_hat_t is demeaned at fund level for consistency with methodology;
*the 3-step unsmoothed return adds the fund mean, the common shock, and the fund-specific;
*part of the residual scaled by 1/(1 - phi1 - phi2);
*rows 1 and 2 of each fund fall back to the reported return;
data re03_step3;
    merge re03_step3 w_mean_eta_hat_fund;
    by fundid_mer;
    eta_hat_t_funddem = eta_hat_t - eta_hat_t_mean_fund;
    au_s3_unsm_ret = av_ret_fundid + eta_hat_t_funddem + (ex_ew_ar2_resid -(1-pi1_fund_aggr_ew-pi2_fund_aggr_ew)*eta_hat_t_funddem)/(1-phi1_fund_ex_ew-phi2_fund_ex_ew);
    if fund_seq le 2 then au_s3_unsm_ret = ret;
run;

*********************************************************************;
*convert the three log-return series back to simple returns and keep only the needed columns;
data re04;
    set re03_step3;
    ret            = exp(ret) - 1;
    au_unsm_ret    = exp(unsm_ret) - 1;
    au_s3_unsm_ret = exp(au_s3_unsm_ret) - 1;
    keep ret au_unsm_ret au_s3_unsm_ret
         fundid_mer yyyyq net_aum3 lag_net_aum3 fund_seq;
run;

*fund-level means of the three simple-return series;
proc sort data = re04;
    by fundid_mer;
run;

proc summary data = re04;
    by fundid_mer;
    var ret au_unsm_ret au_s3_unsm_ret;
    output out = w_s3_av_fundret
        mean = v_av_ret_fund v_av_au_unsm_ret_fund v_av_au_s3_unsm_ret_fund;
run;

*shift each unsmoothed series so that its fund-level mean equals the mean reported return;
data re04;
    merge re04 w_s3_av_fundret;
    by fundid_mer;
    unsm_ret    = au_unsm_ret    - v_av_au_unsm_ret_fund    + v_av_ret_fund;
    s3_unsm_ret = au_s3_unsm_ret - v_av_au_s3_unsm_ret_fund + v_av_ret_fund;
    drop _TYPE_ _FREQ_;
run;

*outlier check on the unsmoothed series;
*for private CRE data it is reasonable for the STD of unsmoothed returns;
*to be 1.5 to 3 times that of the reported returns;
*rule of thumb: a ratio std(unsmoothed)/std(reported) above 5 points to an error;
*(a data reporting error, or an AR model that does not capture the smoothing process);
*flagged funds should be checked manually;
proc sort data = re04;
    by fundid_mer yyyyq;
run;

proc summary data = re04;
    by fundid_mer;
    var ret unsm_ret s3_unsm_ret;
    output out = w_check_s3_outl
        std = r0_std r1_std r3_std
        min = max = /autoname;
run;

*ratios of unsmoothed to reported volatility and the 0/1 error flag;
data w_check_s3_outl;
    set w_check_s3_outl;
    r1_std_ratio = r1_std/r0_std;
    r3_std_ratio = r3_std/r0_std;
    flag_potential_error = (r1_std_ratio > 5 or r3_std_ratio > 5);
    drop _TYPE_;
run;

*two orderings for eyeballing, the dataset is left sorted by the 3-step ratio;
proc sort data = w_check_s3_outl;
    by descending r1_std_ratio;
run;

proc sort data = w_check_s3_outl;
    by descending r3_std_ratio;
run;


*Add lags 1 to 4 of the reported, 1-step and 3-step unsmoothed returns;
proc sort data = re04;
    by fundid_mer yyyyq;
run;

data re04;
    set re04;

    *reported return;
    lag1_ret = lag1(ret);
    lag2_ret = lag2(ret);
    lag3_ret = lag3(ret);
    lag4_ret = lag4(ret);

    *1-step unsmoothed return;
    lag1_unsm_ret = lag1(unsm_ret);
    lag2_unsm_ret = lag2(unsm_ret);
    lag3_unsm_ret = lag3(unsm_ret);
    lag4_unsm_ret = lag4(unsm_ret);

    *3-step unsmoothed return;
    lag1_s3_unsm_ret = lag1(s3_unsm_ret);
    lag2_s3_unsm_ret = lag2(s3_unsm_ret);
    lag3_s3_unsm_ret = lag3(s3_unsm_ret);
    lag4_s3_unsm_ret = lag4(s3_unsm_ret);

    *a lag of order k is undefined on the first k rows of a fund;
    if fund_seq = 1 then do;
        lag1_ret = .;
        lag1_unsm_ret = .;
        lag1_s3_unsm_ret = .;
    end;
    if fund_seq le 2 then do;
        lag2_ret = .;
        lag2_unsm_ret = .;
        lag2_s3_unsm_ret = .;
    end;
    if fund_seq le 3 then do;
        lag3_ret = .;
        lag3_unsm_ret = .;
        lag3_s3_unsm_ret = .;
    end;
    if fund_seq le 4 then do;
        lag4_ret = .;
        lag4_unsm_ret = .;
        lag4_s3_unsm_ret = .;
    end;
run;

*analysis sample: drop the last 2 quarters of 1993, which are not unsmoothed;
data re04_clean;
    set re04;
    where yyyyq ge 19941;
run;


*fund-level autocorrelation regressions;
*for each of the three return series: one regression on lags 1-4 jointly, then one per single lag;
*the order of the MODEL statements fixes the MODEL1 ... MODEL15 labels used further down;
proc sort data = re04_clean;
    by fundid_mer yyyyq;
run;

proc reg data = re04_clean tableout outest = re04_autoc_reg1 noprint;
    by fundid_mer;

    *models 1-5: reported return;
    model ret = lag1_ret lag2_ret lag3_ret lag4_ret / edf;
    model ret = lag1_ret / edf;
    model ret = lag2_ret / edf;
    model ret = lag3_ret / edf;
    model ret = lag4_ret / edf;

    *models 6-10: 1-step unsmoothed return;
    model unsm_ret = lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret / edf;
    model unsm_ret = lag1_unsm_ret / edf;
    model unsm_ret = lag2_unsm_ret / edf;
    model unsm_ret = lag3_unsm_ret / edf;
    model unsm_ret = lag4_unsm_ret / edf;

    *models 11-15: 3-step unsmoothed return;
    model s3_unsm_ret = lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret / edf;
    model s3_unsm_ret = lag1_s3_unsm_ret / edf;
    model s3_unsm_ret = lag2_s3_unsm_ret / edf;
    model s3_unsm_ret = lag3_s3_unsm_ret / edf;
    model s3_unsm_ret = lag4_s3_unsm_ret / edf;
run; quit;

*keep coefficient and t-stat rows and build 0/1 indicators of positive significance;
*an indicator is 1 when the t-stat of the lag exceeds 1.65, 0 otherwise;
*it is defined only on T rows whose dependent variable matches the series of the lag,;
*everywhere else it is missing so that it does not enter the averages across funds;
data v_re04_autoc_reg1_coeff;
    set re04_autoc_reg1;
    if _TYPE_ in ('PARMS', 'T');
    param = _TYPE_;

    *t-stats (on T rows) of the 12 lag regressors, grouped 4 per return series;
    array _tval{12}
        lag1_ret lag2_ret lag3_ret lag4_ret
        lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret
        lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret;

    *matching significance indicators;
    array _sig{12}
        lag1_ret_p_lt_010 lag2_ret_p_lt_010 lag3_ret_p_lt_010 lag4_ret_p_lt_010
        lag1_unsm_ret_p_lt_010 lag2_unsm_ret_p_lt_010 lag3_unsm_ret_p_lt_010 lag4_unsm_ret_p_lt_010
        lag1_s3_unsm_ret_p_lt_010 lag2_s3_unsm_ret_p_lt_010 lag3_s3_unsm_ret_p_lt_010 lag4_s3_unsm_ret_p_lt_010;

    *dependent variable that goes with each group of 4 lags;
    array _dep{3} $ 32 _temporary_ ('ret' 'unsm_ret' 's3_unsm_ret');

    do _k = 1 to 12;
        if _TYPE_ = 'PARMS' or _DEPVAR_ ne _dep{ceil(_k/4)} then _sig{_k} = .;
        else _sig{_k} = (_tval{_k} gt 1.65);
    end;
    drop _k;
run;

*average coefficients, t-stats and significance indicators across funds,;
*separately for each dependent variable, model and row type;
proc sort data = v_re04_autoc_reg1_coeff;
    by _DEPVAR_ _MODEL_ param;
run;

*MEAN= without a name list stores each average under the name of its analysis variable;
proc summary data = v_re04_autoc_reg1_coeff;
    by _DEPVAR_ _MODEL_ param;
    var lag1_ret lag2_ret lag3_ret lag4_ret
        lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret
        lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret
        lag1_ret_p_lt_010 lag2_ret_p_lt_010 lag3_ret_p_lt_010 lag4_ret_p_lt_010
        lag1_unsm_ret_p_lt_010 lag2_unsm_ret_p_lt_010 lag3_unsm_ret_p_lt_010 lag4_unsm_ret_p_lt_010
        lag1_s3_unsm_ret_p_lt_010 lag2_s3_unsm_ret_p_lt_010 lag3_s3_unsm_ret_p_lt_010 lag4_s3_unsm_ret_p_lt_010;
    output out = re04_autoc_reg1_avg_coeff mean = ;
run;

*numeric model index taken from the digits that follow the text MODEL in _MODEL_;
data re04_autoc_reg1_avg_coeff;
    set re04_autoc_reg1_avg_coeff;
    model_n = substr(_MODEL_,6,2)*1;
    drop _TYPE_;
run;

*organize data for output table;
*part 1: average coefficients (statis = coeffic) and average t-stats (statis = t_av);
data v_re_autc_fund_1;
    set re04_autoc_reg1_avg_coeff;
    length statis $ 7;
    if param = 'T' then statis = 't_av';
    else statis = 'coeffic';
    keep model_n statis lag1_ret lag2_ret lag3_ret lag4_ret
         lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret
         lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret;
run;

*part 2: averaged significance indicators (statis = wsig10), stored under the lag names;
*so that they stack with part 1 - the averaged t-stat columns are dropped on input to free those names;
data v_re_autc_fund_2;
    set re04_autoc_reg1_avg_coeff(drop = lag1_ret lag2_ret lag3_ret lag4_ret
                                         lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret
                                         lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret);
    length statis $ 7;
    statis = 'wsig10';
    if param = 'T';

    lag1_ret = lag1_ret_p_lt_010;
    lag2_ret = lag2_ret_p_lt_010;
    lag3_ret = lag3_ret_p_lt_010;
    lag4_ret = lag4_ret_p_lt_010;

    lag1_unsm_ret = lag1_unsm_ret_p_lt_010;
    lag2_unsm_ret = lag2_unsm_ret_p_lt_010;
    lag3_unsm_ret = lag3_unsm_ret_p_lt_010;
    lag4_unsm_ret = lag4_unsm_ret_p_lt_010;

    lag1_s3_unsm_ret = lag1_s3_unsm_ret_p_lt_010;
    lag2_s3_unsm_ret = lag2_s3_unsm_ret_p_lt_010;
    lag3_s3_unsm_ret = lag3_s3_unsm_ret_p_lt_010;
    lag4_s3_unsm_ret = lag4_s3_unsm_ret_p_lt_010;

    keep model_n statis lag1_ret lag2_ret lag3_ret lag4_ret
         lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret
         lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret;
run;

*stack both parts and order by model, then statistic;
data v_tab_autc_fund;
    set v_re_autc_fund_2 v_re_autc_fund_1;
run;

proc sort data = v_tab_autc_fund;
    by model_n statis;
run;

*drop empty cells: each model contributes only the columns it actually estimates;
*the single-lag models are placed side by side with a one-to-one merge (no BY),;
*which relies on every piece holding its statistics in the same row order;

*reported return: model 1 is multivariate, models 2-5 are the single-lag regressions;
data w_tab_autc_fund_m1;
    set v_tab_autc_fund;
    where model_n = 1;
    mod = 'multivar';
    keep mod statis lag1_ret lag2_ret lag3_ret lag4_ret;
run;
data w_tab_autc_fund_m2;
    set v_tab_autc_fund;
    where model_n = 2;
    keep statis lag1_ret;
run;
data w_tab_autc_fund_m3;
    set v_tab_autc_fund;
    where model_n = 3;
    keep statis lag2_ret;
run;
data w_tab_autc_fund_m4;
    set v_tab_autc_fund;
    where model_n = 4;
    keep statis lag3_ret;
run;
data w_tab_autc_fund_m5;
    set v_tab_autc_fund;
    where model_n = 5;
    keep statis lag4_ret;
run;
data w_tab_autc_fund_m2345;
    merge w_tab_autc_fund_m2 w_tab_autc_fund_m3 w_tab_autc_fund_m4 w_tab_autc_fund_m5;
    mod = 'univar';
run;
data w_tab_autc_fund_m12345;
    set w_tab_autc_fund_m1 w_tab_autc_fund_m2345;
run;

*1-step unsmoothed return: models 6-10;
data w_tab_autc_fund_m6;
    set v_tab_autc_fund;
    where model_n = 6;
    keep lag1_unsm_ret lag2_unsm_ret lag3_unsm_ret lag4_unsm_ret;
run;
data w_tab_autc_fund_m7;
    set v_tab_autc_fund;
    where model_n = 7;
    keep lag1_unsm_ret;
run;
data w_tab_autc_fund_m8;
    set v_tab_autc_fund;
    where model_n = 8;
    keep lag2_unsm_ret;
run;
data w_tab_autc_fund_m9;
    set v_tab_autc_fund;
    where model_n = 9;
    keep lag3_unsm_ret;
run;
data w_tab_autc_fund_m10;
    set v_tab_autc_fund;
    where model_n = 10;
    keep lag4_unsm_ret;
run;
data w_tab_autc_fund_m78910;
    merge w_tab_autc_fund_m7 w_tab_autc_fund_m8 w_tab_autc_fund_m9 w_tab_autc_fund_m10;
run;
data w_tab_autc_fund_m678910;
    set w_tab_autc_fund_m6 w_tab_autc_fund_m78910;
run;

*3-step unsmoothed return: models 11-15;
data w_tab_autc_fund_m11;
    set v_tab_autc_fund;
    where model_n = 11;
    keep lag1_s3_unsm_ret lag2_s3_unsm_ret lag3_s3_unsm_ret lag4_s3_unsm_ret;
run;
data w_tab_autc_fund_m12;
    set v_tab_autc_fund;
    where model_n = 12;
    keep lag1_s3_unsm_ret;
run;
data w_tab_autc_fund_m13;
    set v_tab_autc_fund;
    where model_n = 13;
    keep lag2_s3_unsm_ret;
run;
data w_tab_autc_fund_m14;
    set v_tab_autc_fund;
    where model_n = 14;
    keep lag3_s3_unsm_ret;
run;
data w_tab_autc_fund_m15;
    set v_tab_autc_fund;
    where model_n = 15;
    keep lag4_s3_unsm_ret;
run;
data w_tab_autc_fund_m12131415;
    merge w_tab_autc_fund_m12 w_tab_autc_fund_m13 w_tab_autc_fund_m14 w_tab_autc_fund_m15;
run;
data w_tab_autc_fund_m1112131415;
    set w_tab_autc_fund_m11 w_tab_autc_fund_m12131415;
run;

*final layout: labels first, then reported, 1-step and 3-step columns side by side;
data tab_autc_fund_org;
    merge w_tab_autc_fund_m12345(keep = mod statis)
          w_tab_autc_fund_m12345(drop = mod statis)
          w_tab_autc_fund_m678910
          w_tab_autc_fund_m1112131415;
run;

*** Table "tab_autc_fund_org" contains the Fund Level autocorrelation results reported in Figure 1 and Table 6;


*equal-weighted average of reported, 1-step and 3-step unsmoothed returns, per quarter;
proc sort data = re04_clean;
    by yyyyq;
run;

proc summary data = re04_clean;
    by yyyyq;
    var ret unsm_ret s3_unsm_ret;
    output out = re04_aggr_ew mean = ret_ew unsm_ret_ew s3_unsm_ret_ew;
run;

*lags 1 to 4 of each aggregate series, used as regressors below;
proc sort data = re04_aggr_ew;
    by yyyyq;
run;

data re04_aggr_ew;
    set re04_aggr_ew;
    n_funds = _FREQ_;

    *reported;
    lag1_ret_ew = lag1(ret_ew);
    lag2_ret_ew = lag2(ret_ew);
    lag3_ret_ew = lag3(ret_ew);
    lag4_ret_ew = lag4(ret_ew);

    *unsmoothed;
    lag1_unsm_ret_ew = lag1(unsm_ret_ew);
    lag2_unsm_ret_ew = lag2(unsm_ret_ew);
    lag3_unsm_ret_ew = lag3(unsm_ret_ew);
    lag4_unsm_ret_ew = lag4(unsm_ret_ew);

    *3-step unsmoothed;
    lag1_s3_unsm_ret_ew = lag1(s3_unsm_ret_ew);
    lag2_s3_unsm_ret_ew = lag2(s3_unsm_ret_ew);
    lag3_s3_unsm_ret_ew = lag3(s3_unsm_ret_ew);
    lag4_s3_unsm_ret_ew = lag4(s3_unsm_ret_ew);

    drop _TYPE_ _FREQ_;
run;

proc sort data = re04_aggr_ew;
    by yyyyq;
run;

*aggregate-level autocorrelation regressions, same 15-model layout as at fund level;
proc reg data = re04_aggr_ew tableout outest = re04_aggr_ew_autoc noprint;
    *models 1-5: reported;
    model ret_ew = lag1_ret_ew lag2_ret_ew lag3_ret_ew lag4_ret_ew / edf;
    model ret_ew = lag1_ret_ew / edf;
    model ret_ew = lag2_ret_ew / edf;
    model ret_ew = lag3_ret_ew / edf;
    model ret_ew = lag4_ret_ew / edf;

    *models 6-10: 1-step unsmoothed;
    model unsm_ret_ew = lag1_unsm_ret_ew lag2_unsm_ret_ew lag3_unsm_ret_ew lag4_unsm_ret_ew / edf;
    model unsm_ret_ew = lag1_unsm_ret_ew / edf;
    model unsm_ret_ew = lag2_unsm_ret_ew / edf;
    model unsm_ret_ew = lag3_unsm_ret_ew / edf;
    model unsm_ret_ew = lag4_unsm_ret_ew / edf;

    *models 11-15: 3-step unsmoothed;
    model s3_unsm_ret_ew = lag1_s3_unsm_ret_ew lag2_s3_unsm_ret_ew lag3_s3_unsm_ret_ew lag4_s3_unsm_ret_ew / edf;
    model s3_unsm_ret_ew = lag1_s3_unsm_ret_ew / edf;
    model s3_unsm_ret_ew = lag2_s3_unsm_ret_ew / edf;
    model s3_unsm_ret_ew = lag3_s3_unsm_ret_ew / edf;
    model s3_unsm_ret_ew = lag4_s3_unsm_ret_ew / edf;
run; quit;

*keep coefficient, t-stat and p-value rows and add the numeric model index;
data re04_aggr_ew_autoc;
    set re04_aggr_ew_autoc;
    if _TYPE_ in ('PARMS', 'T', 'PVALUE');
    model_n = substr(_MODEL_,6,2)*1;
run;

*slim version for the output table, statis carries the row type;
data v_tab_autc_agg_ew;
    set re04_aggr_ew_autoc;
    statis = _TYPE_;
    keep statis _DEPVAR_
         lag1_ret_ew lag2_ret_ew lag3_ret_ew lag4_ret_ew
         lag1_unsm_ret_ew lag2_unsm_ret_ew lag3_unsm_ret_ew lag4_unsm_ret_ew
         lag1_s3_unsm_ret_ew lag2_s3_unsm_ret_ew lag3_s3_unsm_ret_ew lag4_s3_unsm_ret_ew
         model_n;
run;

*organize, drop empty cells: each model contributes only the columns it estimates;
*single-lag models are combined with a one-to-one merge (no BY), which relies on;
*every piece listing its statistics in the same row order;

*reported return: model 1 is multivariate, models 2-5 are the single-lag regressions;
data w_tab_autc_agg_ew_m1;
    set v_tab_autc_agg_ew;
    where model_n = 1;
    mod = 'multivar';
    keep mod statis lag1_ret_ew lag2_ret_ew lag3_ret_ew lag4_ret_ew;
run;
data w_tab_autc_agg_ew_m2;
    set v_tab_autc_agg_ew;
    where model_n = 2;
    keep statis lag1_ret_ew;
run;
data w_tab_autc_agg_ew_m3;
    set v_tab_autc_agg_ew;
    where model_n = 3;
    keep statis lag2_ret_ew;
run;
data w_tab_autc_agg_ew_m4;
    set v_tab_autc_agg_ew;
    where model_n = 4;
    keep statis lag3_ret_ew;
run;
data w_tab_autc_agg_ew_m5;
    set v_tab_autc_agg_ew;
    where model_n = 5;
    keep statis lag4_ret_ew;
run;
data w_tab_autc_agg_ew_m2345;
    merge w_tab_autc_agg_ew_m2 w_tab_autc_agg_ew_m3 w_tab_autc_agg_ew_m4 w_tab_autc_agg_ew_m5;
    mod = 'univar';
run;
data w_tab_autc_agg_ew_m12345;
    set w_tab_autc_agg_ew_m1 w_tab_autc_agg_ew_m2345;
run;

*1-step unsmoothed return: models 6-10;
data w_tab_autc_agg_ew_m6;
    set v_tab_autc_agg_ew;
    where model_n = 6;
    keep lag1_unsm_ret_ew lag2_unsm_ret_ew lag3_unsm_ret_ew lag4_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m7;
    set v_tab_autc_agg_ew;
    where model_n = 7;
    keep lag1_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m8;
    set v_tab_autc_agg_ew;
    where model_n = 8;
    keep lag2_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m9;
    set v_tab_autc_agg_ew;
    where model_n = 9;
    keep lag3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m10;
    set v_tab_autc_agg_ew;
    where model_n = 10;
    keep lag4_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m78910;
    merge w_tab_autc_agg_ew_m7 w_tab_autc_agg_ew_m8 w_tab_autc_agg_ew_m9 w_tab_autc_agg_ew_m10;
run;
data w_tab_autc_agg_ew_m678910;
    set w_tab_autc_agg_ew_m6 w_tab_autc_agg_ew_m78910;
run;

*3-step unsmoothed return: models 11-15;
data w_tab_autc_agg_ew_m11;
    set v_tab_autc_agg_ew;
    where model_n = 11;
    keep lag1_s3_unsm_ret_ew lag2_s3_unsm_ret_ew lag3_s3_unsm_ret_ew lag4_s3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m12;
    set v_tab_autc_agg_ew;
    where model_n = 12;
    keep lag1_s3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m13;
    set v_tab_autc_agg_ew;
    where model_n = 13;
    keep lag2_s3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m14;
    set v_tab_autc_agg_ew;
    where model_n = 14;
    keep lag3_s3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m15;
    set v_tab_autc_agg_ew;
    where model_n = 15;
    keep lag4_s3_unsm_ret_ew;
run;
data w_tab_autc_agg_ew_m12131415;
    merge w_tab_autc_agg_ew_m12 w_tab_autc_agg_ew_m13 w_tab_autc_agg_ew_m14 w_tab_autc_agg_ew_m15;
run;
data w_tab_autc_agg_ew_m1112131415;
    set w_tab_autc_agg_ew_m11 w_tab_autc_agg_ew_m12131415;
run;

*final layout: labels first, then reported, 1-step and 3-step columns side by side;
data tab_autc_agg_ew_org;
    merge w_tab_autc_agg_ew_m12345(keep = mod statis)
          w_tab_autc_agg_ew_m12345(drop = mod statis)
          w_tab_autc_agg_ew_m678910
          w_tab_autc_agg_ew_m1112131415;
run;

*** Table "tab_autc_agg_ew_org" contains the Aggregate Level autocorrelation results reported in Figure 1 and Table 6;



/*************************** REGRESSIONS (compute alphas and betas) ******************************/

**** Read in and organize factors;
* Two models: RE factor only (re) equity + RE factor (eme);
*local copy of the factor file, which holds the market factor and the RF rate;
data hf_factors00;
    set reautoc.hf_factors;
run;

*map each yyyymm row to its calendar quarter (yyyyq) and take logs so that;
*quarterly compounding becomes a sum;
data emktf;
    set hf_factors00;
    yyyy = round(yyyymm/100,1);
    mm = yyyymm - yyyy*100;
    if mm ge 10 then _qtr = 4;
    else if mm ge 7 then _qtr = 3;
    else if mm ge 4 then _qtr = 2;
    else _qtr = 1;
    yyyyq = yyyy*10 + _qtr;
    log_mktrf = log(1+mkt_rf);
    log_ffrf = log(1+rf);
    keep yyyymm yyyyq log_mktrf log_ffrf;
run;

*compound market factor and risk-free rate to quarterly frequency;
*NOTE(review): no check on the number of rows per quarter is applied here - TODO confirm the file has full quarters;
proc sort data = emktf;
    by yyyyq;
run;

proc summary data = emktf;
    by yyyyq;
    var log_mktrf log_ffrf;
    output out = emktf_q sum = /autoname;
run;

*back from summed logs to simple quarterly returns;
data emktf_q;
    set emktf_q;
    mktrf_q = exp(log_mktrf_sum) - 1;
    ffrf_q = exp(log_ffrf_sum) - 1;
    keep yyyyq mktrf_q ffrf_q;
run;

*public REITS factor;
*assign each row to a calendar quarter and take logs of the equity and debt REIT returns;
*NOTE(review): the arithmetic below treats date as a yyyymm number - TODO confirm;
data pubreits;
    set reautoc.publicreitsreturns;
    yyyy = round(date/100,1);
    mm = date - yyyy*100;
    if mm ge 10 then _qtr = 4;
    else if mm ge 7 then _qtr = 3;
    else if mm ge 4 then _qtr = 2;
    else _qtr = 1;
    yyyyq = yyyy*10 + _qtr;
    log_ret_equity_reits = log(1+ret_equity_reits);
    log_ret_debt_reits = log(1+ret_debt_reits);
    keep yyyyq log_ret_equity_reits log_ret_debt_reits;
run;

*sum the logs within quarter;
proc sort data = pubreits;
    by yyyyq;
run;

proc summary data = pubreits;
    by yyyyq;
    var log_ret_equity_reits log_ret_debt_reits;
    output out = pubreits_qq sum = /autoname;
run;

*discard quarters with fewer than 3 rows, then convert back to simple quarterly returns;
data pubreits_qq;
    set pubreits_qq;
    if _FREQ_ lt 3 then delete;
    ret_equity_reits_qq = exp(log_ret_equity_reits_sum) - 1;
    ret_debt_reits_qq = exp(log_ret_debt_reits_sum) - 1;
    keep yyyyq ret_equity_reits_qq ret_debt_reits_qq;
run;

*merge factors together;
proc sort data = emktf_q; by yyyyq; run;
proc sort data = pubreits_qq; by yyyyq; run;

/* Quarterly factor file: every quarter present in the market factor table is kept,
   REIT returns are attached where available. */
data re_factors;
    merge emktf_q(in = has_mkt) pubreits_qq;
    by yyyyq;
    if has_mkt;
run;

*Make Models and Rename factors;
/* Naming convention: mo<model>_f<factor>.
     model 1: mo1_f1 = equity REIT return minus risk-free rate
     model 2: mo2_f1 = same REIT excess return, mo2_f2 = market excess return
   rf is carried along to build fund excess returns later. Labels are blanked. */
data re_factors00a;
    set re_factors;
    mo1_f1 = ret_equity_reits_qq - ffrf_q;

    mo2_f1 = ret_equity_reits_qq - ffrf_q;
    mo2_f2 = mktrf_q;

    rf = ffrf_q;
    label rf     = ' '
          mo1_f1 = ' '
          mo2_f1 = ' '
          mo2_f2 = ' ';
    keep yyyyq mo1_f1 mo2_f1 mo2_f2 rf;
run;


*attach factors to dataset with fund returns;
proc sort data = re04_clean; by yyyyq; run;
proc sort data = re_factors00a; by yyyyq; run;

/* Every fund-quarter of re04_clean is kept and the quarterly factors are attached.
   Excess returns over the risk-free rate are built in the same pass:
     r0 = reported return (ret), r1 = unsm_ret, r3 = s3_unsm_ret. */
data re05;
    merge re04_clean(in = in_funds) re_factors00a;
    by yyyyq;
    if in_funds;
    r0 = ret - rf;
    r1 = unsm_ret - rf;
    r3 = s3_unsm_ret - rf;
run;

proc sort data = re05; by fundid_mer yyyyq; run;

*Run factor regressions fund by fund;
/* Six regressions per fund. PROC REG labels them MODEL1-MODEL6 in statement order:
     MODEL1-3: r0, r1, r3 on the model 1 factor
     MODEL4-6: r0, r1, r3 on the two model 2 factors
   OUTEST= together with TABLEOUT stores parameter estimates and their t-statistics
   in re05_regr_fund (rows are distinguished by _TYPE_). */
proc sort data = re05; by fundid_mer yyyyq; run;
proc reg data = re05 outest = re05_regr_fund tableout noprint;
    by fundid_mer;
    model r0 = mo1_f1 / adjrsq edf;
    model r1 = mo1_f1 / adjrsq edf;
    model r3 = mo1_f1 / adjrsq edf;
    model r0 = mo2_f1 mo2_f2 / adjrsq edf;
    model r1 = mo2_f1 mo2_f2 / adjrsq edf;
    model r3 = mo2_f1 mo2_f2 / adjrsq edf;
run; quit;

*First part of table: keep intercepts and coefficients and organize;
/* Keep the parameter-estimate rows and copy each model's intercept and slopes into
   columns named <return>_<model>_<a|f1|f2>. Every row fills only the columns of its
   own model, the other columns stay missing. */
data v_re05_regr_fund1;
    set re05_regr_fund;
    if _TYPE_ = 'PARMS';
    if _MODEL_ = 'MODEL1' then do;
        r0_mo1_a  = intercept;
        r0_mo1_f1 = mo1_f1;
    end;
    else if _MODEL_ = 'MODEL2' then do;
        r1_mo1_a  = intercept;
        r1_mo1_f1 = mo1_f1;
    end;
    else if _MODEL_ = 'MODEL3' then do;
        r3_mo1_a  = intercept;
        r3_mo1_f1 = mo1_f1;
    end;
    else if _MODEL_ = 'MODEL4' then do;
        r0_mo2_a  = intercept;
        r0_mo2_f1 = mo2_f1;
        r0_mo2_f2 = mo2_f2;
    end;
    else if _MODEL_ = 'MODEL5' then do;
        r1_mo2_a  = intercept;
        r1_mo2_f1 = mo2_f1;
        r1_mo2_f2 = mo2_f2;
    end;
    else if _MODEL_ = 'MODEL6' then do;
        r3_mo2_a  = intercept;
        r3_mo2_f1 = mo2_f1;
        r3_mo2_f2 = mo2_f2;
    end;
run;

*Organize fund-level coefficients;
/* Collapse the per-model rows to one row per fund. Each column is non-missing on the
   rows of a single model only, so the by-fund mean simply picks up that value.
   MEAN= without a name list stores each mean under the analysis variable name. */
proc summary data = v_re05_regr_fund1;
    by fundid_mer;
    var r0_mo1_a r0_mo1_f1
        r1_mo1_a r1_mo1_f1
        r3_mo1_a r3_mo1_f1
        r0_mo2_a r0_mo2_f1 r0_mo2_f2
        r1_mo2_a r1_mo2_f1 r1_mo2_f2
        r3_mo2_a r3_mo2_f1 r3_mo2_f2;
    output out = v_re05_regr_coeff_byfund mean = ;
run;

*std dev of returns;
/* Per-fund standard deviation of the three excess return series. The automatic
   _TYPE_ and _FREQ_ columns are dropped directly on the output dataset. */
proc sort data = re05; by fundid_mer yyyyq; run;
proc summary data = re05;
    by fundid_mer;
    var r0 r1 r3;
    output out = v_re05_std_of_ret(drop = _TYPE_ _FREQ_)
           std = r0_std r1_std r3_std;
run;

/* Attach the standard deviations to the fund-level coefficient table. */
data v_re05_regr_coeff_byfund;
    merge v_re05_regr_coeff_byfund v_re05_std_of_ret;
    by fundid_mer;
run;

*compute differences;
/* Fund-level differences between return versions for every statistic.
   Prefix r1r0 = r1 minus r0, r3r0 = r3 minus r0, r3r1 = r3 minus r1. */
data v_re05_regr_coeff_byfund(drop = _TYPE_ _FREQ_);
    set v_re05_regr_coeff_byfund;

    /* standard deviation of returns */
    r1r0_std    = r1_std    - r0_std;
    r3r0_std    = r3_std    - r0_std;
    r3r1_std    = r3_std    - r1_std;

    /* model 1: intercept and factor loading */
    r1r0_mo1_a  = r1_mo1_a  - r0_mo1_a;
    r3r0_mo1_a  = r3_mo1_a  - r0_mo1_a;
    r3r1_mo1_a  = r3_mo1_a  - r1_mo1_a;
    r1r0_mo1_f1 = r1_mo1_f1 - r0_mo1_f1;
    r3r0_mo1_f1 = r3_mo1_f1 - r0_mo1_f1;
    r3r1_mo1_f1 = r3_mo1_f1 - r1_mo1_f1;

    /* model 2: intercept and both factor loadings */
    r1r0_mo2_a  = r1_mo2_a  - r0_mo2_a;
    r3r0_mo2_a  = r3_mo2_a  - r0_mo2_a;
    r3r1_mo2_a  = r3_mo2_a  - r1_mo2_a;
    r1r0_mo2_f1 = r1_mo2_f1 - r0_mo2_f1;
    r3r0_mo2_f1 = r3_mo2_f1 - r0_mo2_f1;
    r3r1_mo2_f1 = r3_mo2_f1 - r1_mo2_f1;
    r1r0_mo2_f2 = r1_mo2_f2 - r0_mo2_f2;
    r3r0_mo2_f2 = r3_mo2_f2 - r0_mo2_f2;
    r3r1_mo2_f2 = r3_mo2_f2 - r1_mo2_f2;
run;

*average coefficient and t-stat for mean of coefficients;
/* One pass over the fund-level table produces both outputs: the cross-fund mean of
   every statistic and the t-statistic of that mean. Each OUTPUT statement uses a
   keyword without a name list, so results keep the analysis variable names. */
proc summary data = v_re05_regr_coeff_byfund;
    var r0_std r1_std r3_std r1r0_std r3r0_std r3r1_std

        r0_mo1_a  r1_mo1_a  r3_mo1_a  r1r0_mo1_a  r3r0_mo1_a  r3r1_mo1_a
        r0_mo1_f1 r1_mo1_f1 r3_mo1_f1 r1r0_mo1_f1 r3r0_mo1_f1 r3r1_mo1_f1

        r0_mo2_a  r1_mo2_a  r3_mo2_a  r1r0_mo2_a  r3r0_mo2_a  r3r1_mo2_a
        r0_mo2_f1 r1_mo2_f1 r3_mo2_f1 r1r0_mo2_f1 r3r0_mo2_f1 r3r1_mo2_f1
        r0_mo2_f2 r1_mo2_f2 r3_mo2_f2 r1r0_mo2_f2 r3r0_mo2_f2 r3r1_mo2_f2;
    output out = v_re05_regr_coeff_avg mean = ;
    output out = v_re05_regr_coeff_t   t    = ;
run;

*organize data;
/* Tag each one-row table with the statistic it contains. */
data v_re05_regr_coeff_avg;
    set v_re05_regr_coeff_avg;
    statis = 'avg_coeff';
run;

data v_re05_regr_coeff_t;
    set v_re05_regr_coeff_t;
    statis = 'tstat';
run;

*Obtain % of coefficients and intercepts that are significant in the regression (at 10%);
*For the alpha, only positive and significant;
/* Work on the t-statistic rows of the regression output.
   - intercept flag: 1 when the intercept t-stat exceeds 1.65 (positive side only).
     The INTERCEPT column is shared by all six models, the row's _MODEL_ tells
     which model it belongs to.
   - slope flags: 1 when the absolute t-stat exceeds 1.65.
   A missing t-stat gives a missing flag for intercepts as well as slopes, so a fund
   whose regression could not be estimated does not count as "not significant". */
data v_re05_regr_fund2;
    set re05_regr_fund;
    if _TYPE_ = 'T';
    intercept_t_gt165 = 0;
    if intercept gt 1.65 then intercept_t_gt165 = 1;
    if intercept = . then intercept_t_gt165 = . ;
    *model1;
    mo1_f1_sign10 = 0;
    if abs(mo1_f1) gt 1.65 then mo1_f1_sign10 = 1;
    if mo1_f1 = . then mo1_f1_sign10 = . ;
    *model2;
    mo2_f1_sign10 = 0;
    if abs(mo2_f1) gt 1.65 then mo2_f1_sign10 = 1;
    if mo2_f1 = . then mo2_f1_sign10 = . ;
    mo2_f2_sign10 = 0;
    if abs(mo2_f2) gt 1.65 then mo2_f2_sign10 = 1;
    if mo2_f2 = . then mo2_f2_sign10 = . ;
    keep _MODEL_ intercept_t_gt165 mo1_f1_sign10
         mo2_f1_sign10 mo2_f2_sign10;
run;

/* Route each model's flags to the same column names used for the coefficients. */
data v_re05_regr_fund2;
    set v_re05_regr_fund2;
    if _MODEL_ = 'MODEL1' then do;
        r0_mo1_a  = intercept_t_gt165;
        r0_mo1_f1 = mo1_f1_sign10;
    end;
    else if _MODEL_ = 'MODEL2' then do;
        r1_mo1_a  = intercept_t_gt165;
        r1_mo1_f1 = mo1_f1_sign10;
    end;
    else if _MODEL_ = 'MODEL3' then do;
        r3_mo1_a  = intercept_t_gt165;
        r3_mo1_f1 = mo1_f1_sign10;
    end;
    else if _MODEL_ = 'MODEL4' then do;
        r0_mo2_a  = intercept_t_gt165;
        r0_mo2_f1 = mo2_f1_sign10;
        r0_mo2_f2 = mo2_f2_sign10;
    end;
    else if _MODEL_ = 'MODEL5' then do;
        r1_mo2_a  = intercept_t_gt165;
        r1_mo2_f1 = mo2_f1_sign10;
        r1_mo2_f2 = mo2_f2_sign10;
    end;
    else if _MODEL_ = 'MODEL6' then do;
        r3_mo2_a  = intercept_t_gt165;
        r3_mo2_f1 = mo2_f1_sign10;
        r3_mo2_f2 = mo2_f2_sign10;
    end;
run;

*obtain % of coefficients significant at 10%;
/* The mean of a 0/1 flag is the share of funds whose coefficient is significant. */
proc summary data = v_re05_regr_fund2;
    var r0_mo1_a r1_mo1_a r3_mo1_a
        r0_mo1_f1 r1_mo1_f1 r3_mo1_f1

        r0_mo2_a r1_mo2_a r3_mo2_a
        r0_mo2_f1 r1_mo2_f1 r3_mo2_f1
        r0_mo2_f2 r1_mo2_f2 r3_mo2_f2;
    output out = v_re05_regr_perc_sign10
           mean = r0_mo1_a r1_mo1_a r3_mo1_a
                  r0_mo1_f1 r1_mo1_f1 r3_mo1_f1

                  r0_mo2_a r1_mo2_a r3_mo2_a
                  r0_mo2_f1 r1_mo2_f1 r3_mo2_f1
                  r0_mo2_f2 r1_mo2_f2 r3_mo2_f2;
run;

data v_re05_regr_perc_sign10;
    set v_re05_regr_perc_sign10;
    statis = 'wsig10';
run;

***Set together to make RE_betas table;
/* Stack the three one-row tables (average, t-stat, share significant). The STATIS
   column identifies each row. _TYPE_ is dropped on the output dataset. */
data tab_re_betas(drop = _TYPE_);
    set v_re05_regr_coeff_avg
        v_re05_regr_coeff_t
        v_re05_regr_perc_sign10;
run;

*Table "tab_re_betas" contain the results reported in Table 7 (except R^2);




/******************************************************************************************************************/
*********************** R^2 and R^2 decomposition;

/* R-squared of each of the six regressions, taken from the parameter rows of the
   PROC REG output. model_n is the numeric suffix of _MODEL_ (MODEL1 -> 1, ...). */
data v_re05_regr_fund_rsq;
    set re05_regr_fund;
    model_n = input(substr(_MODEL_, 6, 2), 8.);
    if _TYPE_ = 'PARMS';
    if _MODEL_ = 'MODEL1' then r0_mo1_rsq = _RSQ_;
    else if _MODEL_ = 'MODEL2' then r1_mo1_rsq = _RSQ_;
    else if _MODEL_ = 'MODEL3' then r3_mo1_rsq = _RSQ_;
    else if _MODEL_ = 'MODEL4' then r0_mo2_rsq = _RSQ_;
    else if _MODEL_ = 'MODEL5' then r1_mo2_rsq = _RSQ_;
    else if _MODEL_ = 'MODEL6' then r3_mo2_rsq = _RSQ_;
run;

*organize data;
/* One dataset per model. The fund id is carried along so the six pieces can be
   matched by fund rather than by row position. */
data w_re05_regr_fund_rsq_m1; set v_re05_regr_fund_rsq; if model_n = 1; keep fundid_mer r0_mo1_rsq; run;
data w_re05_regr_fund_rsq_m2; set v_re05_regr_fund_rsq; if model_n = 2; keep fundid_mer r1_mo1_rsq; run;
data w_re05_regr_fund_rsq_m3; set v_re05_regr_fund_rsq; if model_n = 3; keep fundid_mer r3_mo1_rsq; run;
data w_re05_regr_fund_rsq_m4; set v_re05_regr_fund_rsq; if model_n = 4; keep fundid_mer r0_mo2_rsq; run;
data w_re05_regr_fund_rsq_m5; set v_re05_regr_fund_rsq; if model_n = 5; keep fundid_mer r1_mo2_rsq; run;
data w_re05_regr_fund_rsq_m6; set v_re05_regr_fund_rsq; if model_n = 6; keep fundid_mer r3_mo2_rsq; run;

/* Match-merge by fund. A positional merge would silently pair the R-squared of
   different funds if any model had no row for some fund. The inputs inherit the
   fundid_mer order of the BY-group regression output. */
data v_re05_regr_fund_rsq_org;
    merge w_re05_regr_fund_rsq_m1 w_re05_regr_fund_rsq_m2 w_re05_regr_fund_rsq_m3
          w_re05_regr_fund_rsq_m4 w_re05_regr_fund_rsq_m5 w_re05_regr_fund_rsq_m6;
    by fundid_mer;

    /* changes in R-squared between return versions, model 1 */
    r1r0_mo1_rsq = r1_mo1_rsq - r0_mo1_rsq;
    r3r0_mo1_rsq = r3_mo1_rsq - r0_mo1_rsq;
    r3r1_mo1_rsq = r3_mo1_rsq - r1_mo1_rsq;

    /* changes in R-squared between return versions, model 2 */
    r1r0_mo2_rsq = r1_mo2_rsq - r0_mo2_rsq;
    r3r0_mo2_rsq = r3_mo2_rsq - r0_mo2_rsq;
    r3r1_mo2_rsq = r3_mo2_rsq - r1_mo2_rsq;
run;

*R_sqr: cross-fund average and t-stat;
/* Both statistics come from one pass, each OUTPUT statement writes its own one-row
   table and keeps the analysis variable names. */
proc summary data = v_re05_regr_fund_rsq_org;
    var r0_mo1_rsq r1_mo1_rsq r3_mo1_rsq r1r0_mo1_rsq r3r0_mo1_rsq r3r1_mo1_rsq
        r0_mo2_rsq r1_mo2_rsq r3_mo2_rsq r1r0_mo2_rsq r3r0_mo2_rsq r3r1_mo2_rsq;
    output out = v_re05_regr_fund_rsq_avg mean = ;
    output out = v_re05_regr_fund_rsq_t   t    = ;
run;

/* Tag the rows with the statistic they hold. */
data v_re05_regr_fund_rsq_avg(drop = _TYPE_);
    set v_re05_regr_fund_rsq_avg;
    statis = 'average';
run;

data v_re05_regr_fund_rsq_t(drop = _TYPE_);
    set v_re05_regr_fund_rsq_t;
    statis = 'tstat';
run;

*organize table;
/* Merging a table with itself (keep= first, drop= second) moves STATIS to the
   first column without changing any value. */
data w_tab_rsq_fund_part1a;
    merge v_re05_regr_fund_rsq_avg(keep = statis)
          v_re05_regr_fund_rsq_avg(drop = statis);
run;

data w_tab_rsq_fund_part1b;
    merge v_re05_regr_fund_rsq_t(keep = statis)
          v_re05_regr_fund_rsq_t(drop = statis);
run;

/* Stack the average row and the t-stat row. */
data tab_rsq_fund_part1;
    set w_tab_rsq_fund_part1a
        w_tab_rsq_fund_part1b;
run;

*Table "tab_rsq_fund_part1" contain the R^2 results reported in Table 7;


/************************************************************************************************/
************************************ OVERALL SUMMARY STATS;
***** The following prepares sample summary stats reported in the left columns of Table 7;

/* Fund-quarter panel of the three excess return series. */
data v_re05_rets;
    set re05;
    keep fundid_mer yyyyq fund_seq r0 r1 r3;
run;

/* Time-series mean and standard deviation of each series, fund by fund
   (AUTONAME yields r0_mean, r0_stddev, ...). */
proc sort data = v_re05_rets; by fundid_mer yyyyq; run;
proc summary data = v_re05_rets;
    by fundid_mer;
    var r0 r1 r3;
    output out = v_re05_rets_byfund mean = std = / autoname;
run;

/* Cross-fund averages of the fund-level means and standard deviations. */
proc summary data = v_re05_rets_byfund;
    var r0_mean r0_stddev r1_mean r1_stddev r3_mean r3_stddev;
    output out = w_tab_re_stats_fund
           mean = r0_mean r0_stddev r1_mean r1_stddev r3_mean r3_stddev;
run;

/* Sharpe ratio = average fund mean divided by average fund standard deviation
   (a ratio of averages, not an average of fund-level ratios). */
data w_tab_re_stats_fund;
    set w_tab_re_stats_fund;
    r0_sharpe = r0_mean/r0_stddev;
    r1_sharpe = r1_mean/r1_stddev;
    r3_sharpe = r3_mean/r3_stddev;
    drop _TYPE_;
run;

*organize;
/* Re-merge the one-row table with itself to order the columns by return version.
   _FREQ_ (number of funds) is kept in front. */
data tab_re_stats_fund;
    merge w_tab_re_stats_fund(keep = _FREQ_ r0_mean r0_stddev r0_sharpe)
          w_tab_re_stats_fund(keep = r1_mean r1_stddev r1_sharpe)
          w_tab_re_stats_fund(keep = r3_mean r3_stddev r3_sharpe);
run;











*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 2: MA UNSMOOTHING MAIN/HEDGE FUNDS;
****Produces: Figure 1 (HF part), Figure 2, 4, 5, 6, Table 1, 2, 3,;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;

******** This is the Primary Code for the Hedge Fund Results;
*** This code produces results reported in Figures 1, 2, 4, 5 and 6;
*** This code produces results reported in Tables 1, 2, and 3;

*Description of variables;
*fundid_mer = fund ID;
*ret = reported return for month t;
*yyyymm = date variable;
*assets_fill = AUM of fund as of end of month t;
*fund_type = Hedge fund or CTA;
*stra = fund strategy from original database;


/* Fund-month panel restricted to the variables used below. */
data hf00;
    set hfautoc.hf_merge00_july2020_min5;
    keep fundid_mer ret yyyymm assets_fill fund_type stra bh_fund;
run;

*lag assets;
proc sort data = hf00; by fundid_mer yyyymm; run;

/* Previous row's AUM within the same fund. LAG returns the value from the previous
   observation, so it is blanked whenever the previous observation belongs to a
   different fund. */
data hf00;
    set hf00;
    lag_assets_fill = lag(assets_fill);
    if fundid_mer ne lag(fundid_mer) then lag_assets_fill = . ;
run;

***Attach fund classifications;
* e.g., Relative Value, Event Driven, etc;
*This follows the classification table from Joenvaara et al 2021 (also see appendix A1);
proc import out = work.Strat_manual_1
    datafile = "C:\Users\&pcname.\Dropbox\Research\Hedge Funds\Unsmoothing Returns\Hedge Fund Analysis\Summer_Revision\Strat_manual_1.xlsx"
    dbms = xlsx replace;
    getnames = yes;
run;

proc sort data = hf00; by fund_type stra; run;
proc sort data = strat_manual_1; by fund_type stra; run;

/* Map every (fund_type, stra) pair to its category.
   NOTE(review): no IN= filter is used, so pairs that exist only in the mapping file
   create rows without fund data, and unmapped funds keep a blank categ - confirm
   that later filters remove or handle both cases as intended. */
data hf00;
    merge hf00 strat_manual_1;
    by fund_type stra;
    categ = jkt_category;
    drop jkt_category;
run;

/* Drop the Other category and assign the display rank of each category.
   Any category not listed explicitly (including a blank one) gets rank 1. */
data hf00;
    set hf00;
    if categ = 'Other' then delete;
    if      categ = 'Event_driven'     then aut_rank = 2;
    else if categ = 'Multi_strategy'   then aut_rank = 3;
    else if categ = 'Emerging_Markets' then aut_rank = 4;
    else if categ = 'Sector'           then aut_rank = 5;
    else if categ = 'Long_Only'        then aut_rank = 6;
    else if categ = 'Long_Short'       then aut_rank = 7;
    else if categ = 'Market_Neutral'   then aut_rank = 8;
    else if categ = 'Global Macro'     then aut_rank = 9;
    else if categ = 'CTA'              then aut_rank = 10;
    else if categ = 'FOF'              then aut_rank = 99;
    else                                    aut_rank = 1;
run;

/* Sample period: January 1995 through December 2017 (rows with a missing date are
   removed by the same condition). */
data hf00;
    set hf00;
    if 199501 le yyyymm le 201712;
run;

*average return by fund;
proc sort data = hf00; by fundid_mer; run;

proc summary data = hf00;
    by fundid_mer;
    var ret;
    output out = w_mean_ret_1 mean = av_ret_fundid std = std_ret_fundid;
run;

/* Attach the fund-level mean and standard deviation and keep funds with at least
   36 rows in the sample period (_FREQ_ is the row count of the fund). */
data hf00;
    merge hf00 w_mean_ret_1;
    by fundid_mer;
    fundid_mer_obs = _FREQ_;
    if fundid_mer_obs ge 36;
    drop _FREQ_ _TYPE_;
run;

proc sort data = hf00; by fundid_mer yyyymm; run;

/* Running observation number within fund: 1 for the first row, 2 for the second, ... */
data hf00;
    set hf00;
    by fundid_mer;
    if first.fundid_mer then fund_seq = 0;
    fund_seq + 1;
run;

*Summary stats: number of funds by category in each month, etc;
/* _FREQ_ in w_categ_mm_n is the number of fund rows per category-month. */
proc sort data = hf00; by yyyymm categ; run;
proc summary data = hf00;
    by yyyymm categ;
    var ret;
    output out = w_categ_mm_n mean = av_ret_categ;
run;

/* One row per fund (the first row met for each fund within its category), then a
   count of funds per category through _FREQ_ in w_categ_n. The av_ret_categ column
   of w_categ_n averages only that single retained row per fund. */
proc sort data = hf00(keep = fundid_mer ret categ)
          out  = w_count_categ_funds
          nodupkey;
    by categ fundid_mer;
run;

proc summary data = w_count_categ_funds;
    by categ;
    var ret;
    output out = w_categ_n mean = av_ret_categ;
run;



/******************************************************************************************************************/
**** Part 1: 1-step unsmoothing MA method, as in GLM(2004);

*Note: we want to compare the returns (reported, 1-step and 3-step) across the same set of funds;
*Therefore, if the MA estimation does not converge for a given fund, do not unsmooth its returns, but do not drop the fund;


*start by de-meaning fund return;
data hf00;
    set hf00;
    dem_ret = ret - av_ret_fundid;
run;

*MA with 3 lags;
/* Fund-by-fund MA(3) without intercept on the demeaned reported return, estimated
   by maximum likelihood with starting values of -0.2 for all three MA parameters.
   OUTEST= stores the estimates and the convergence status of each fund. */
proc sort data = hf00; by fundid_mer yyyymm; run;
proc arima data = hf00;
    by fundid_mer;
    identify var = dem_ret noprint;
    estimate q = 3 noint ma = -0.2 -0.2 -0.2 method = ml
             outest = arima_ma3_est outstat = arima_ma3_diag noprint;
    *forecast noprint;
run;
quit;

/* Normalize the coefficients so the smoothing weights sum to one. The raw weights
   are (1, -ma1_1, -ma1_2, -ma1_3), each is divided by their sum.
   ma3_theta_sum_norm is a check value that equals 1 by construction. */
data arima_ma3_est;
    set arima_ma3_est;
    if _TYPE_ = 'EST';
    ma3_STATUS_        = _STATUS_;
    ma3_theta_sum      = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_theta_0        = 1/ma3_theta_sum;
    ma3_theta_1        = -ma1_1/ma3_theta_sum;
    ma3_theta_2        = -ma1_2/ma3_theta_sum;
    ma3_theta_3        = -ma1_3/ma3_theta_sum;
    ma3_theta_sum_norm = ma3_theta_0 + ma3_theta_1 + ma3_theta_2 + ma3_theta_3;
    keep fundid_mer ma3_STATUS_ ma3_theta_0 ma3_theta_1 ma3_theta_2 ma3_theta_3
         ma3_theta_sum ma3_theta_sum_norm;
run;

*attach thetas to main dataset;
data hf01;
    merge hf00 arima_ma3_est;
    by fundid_mer;
run;


***In case of non-convergence, set to MA(0) (i.e., do not unsmooth, but do not drop fund to maintain same sample across methods);
*Please see "Appendix for Code 2" for explanation on handling convergence/outliers from MA estimation;
/* ma3_gt_1_5 flags outlier estimates: any weight above 1.25 or at or below -0.45
   (the flag name says 1_5 but the threshold in the code is 1.25). A missing weight
   also satisfies the LE comparison and is therefore flagged.
   ma3_status flags any estimation status other than "0 Converged".
   Flagged funds get weights (1, 0, 0, 0), i.e. their returns are left as reported. */
data hf01;
    set hf01;
    ma3_gt_1_5 = 0;
    if ma3_theta_0 gt 1.25 or ma3_theta_1 gt 1.25 or ma3_theta_2 gt 1.25 or ma3_theta_3 gt 1.25
        then ma3_gt_1_5 = 1;
    if ma3_theta_0 le -0.45 or ma3_theta_1 le -0.45 or ma3_theta_2 le -0.45 or ma3_theta_3 le -0.45
        then ma3_gt_1_5 = 1;

    ma3_status = 0;
    if ma3_STATUS_ ne "0 Converged" then ma3_status = 1;

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_theta_0 = 1;
        sel_theta_1 = 0;
        sel_theta_2 = 0;
        sel_theta_3 = 0;
    end;
    else do;
        sel_theta_0 = ma3_theta_0;
        sel_theta_1 = ma3_theta_1;
        sel_theta_2 = ma3_theta_2;
        sel_theta_3 = ma3_theta_3;
    end;
run;

/* Average selected weights by category: one row per fund first, then the mean
   within (aut_rank, categ). */
proc sort data = hf01 out = v_check_thetas nodupkey;
    by aut_rank categ fundid_mer;
run;

proc summary data = v_check_thetas;
    by aut_rank categ;
    var sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
    output out = v_av_thetas
           mean = sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
run;

/********************************************************/
*Back out 1-step unsmoothed returns;


proc sort data = hf01; by fundid_mer yyyymm; run;

%macro back_out_rets;

    /* Working copy: the backed-out return starts as the demeaned reported return.
       The first three observations of every fund keep that starting value. */
    data hf01_loop;
        set hf01;
        backed_ret_aic = dem_ret;
        keep backed_ret_aic
             dem_ret fundid_mer yyyymm fund_seq av_ret_fundid
             sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
    run;

    proc sort data = hf01_loop; by fundid_mer yyyymm; run;

    /* Invert the MA(3) filter recursively in ONE pass over the data:
         backed(t) = (dem_ret(t) - theta1*backed(t-1) - theta2*backed(t-2)
                                 - theta3*backed(t-3)) / theta0     for fund_seq >= 4.
       The three retained variables hold the already backed-out values of the three
       preceding rows. Because the recursion starts at fund_seq = 4, those rows always
       belong to the same fund, so no reset at fund boundaries is needed.
       This replaces 273 successive full-dataset passes (one per value of fund_seq)
       and removes the hard-coded limit of 276 observations per fund: the result is
       the same for funds with up to 276 rows, and longer series are now handled too. */
    data hf01_loop;
        set hf01_loop;
        retain lag1_backed_ret_aic lag2_backed_ret_aic lag3_backed_ret_aic;

        if fund_seq ge 4 then
            backed_ret_aic = (dem_ret - sel_theta_1*lag1_backed_ret_aic
                                      - sel_theta_2*lag2_backed_ret_aic
                                      - sel_theta_3*lag3_backed_ret_aic)/sel_theta_0;
        output;

        /* shift the window only after the row has been written */
        lag3_backed_ret_aic = lag2_backed_ret_aic;
        lag2_backed_ret_aic = lag1_backed_ret_aic;
        lag1_backed_ret_aic = backed_ret_aic;
    run;

%mend back_out_rets;
%back_out_rets;

*clean dataset and add back mean;
/* The recursion ran on demeaned returns, so the fund mean is added back. */
data hf01_loop;
    set hf01_loop;
    temp_backed_ret_aic = backed_ret_aic + av_ret_fundid;
    keep fundid_mer yyyymm temp_backed_ret_aic;
run;

proc sort data = hf01_loop; by fundid_mer yyyymm; run;
proc sort data = hf01; by fundid_mer yyyymm; run;

data hf02;
    merge hf01 hf01_loop;
    by fundid_mer yyyymm;
run;

*Adjust mean;
/* Fund-level mean (and standard deviation) of the reported and of the backed-out
   return. AUTONAME yields ret_mean, temp_backed_ret_aic_mean, ... */
proc sort data = hf02; by fundid_mer; run;
proc summary data = hf02;
    by fundid_mer;
    var ret temp_backed_ret_aic;
    output out = v_hf02_ret_mean mean = std = / autoname;
run;

/* Shift the backed-out series so that its fund-level mean equals the mean of the
   reported return. */
data hf02;
    merge hf02 v_hf02_ret_mean;
    by fundid_mer;
    backed_ret_aic = temp_backed_ret_aic + ret_mean - temp_backed_ret_aic_mean;
    drop _TYPE_ _FREQ_;
run;


*Now aggregate returns;
/* Equal-weighted category return per month, for the reported and for the 1-step
   unsmoothed series. */
proc sort data = hf02; by aut_rank categ yyyymm; run;
proc summary data = hf02;
    by aut_rank categ yyyymm;
    var ret backed_ret_aic;
    output out = hf2_ewret mean = ret_ew backed_ret_aic_ew;
run;

proc sort data = hf2_ewret; by categ yyyymm; run;

/* Number of funds per category-month plus first and second lags of both aggregate
   series. A lag is kept only when the row it comes from belongs to the same category.
   same_cat1 / same_cat2 are temporary helpers and are not written out. */
data hf2_ewret;
    set hf2_ewret;
    funds_categ_mm = _FREQ_;

    same_cat1 = (lag1(categ) = categ);
    same_cat2 = (lag2(categ) = categ);

    lag1_ret_ew = lag1(ret_ew);
    if not same_cat1 then lag1_ret_ew = . ;
    lag2_ret_ew = lag2(ret_ew);
    if not same_cat2 then lag2_ret_ew = . ;

    lag1_backed_ret_aic_ew = lag1(backed_ret_aic_ew);
    if not same_cat1 then lag1_backed_ret_aic_ew = . ;
    lag2_backed_ret_aic_ew = lag2(backed_ret_aic_ew);
    if not same_cat2 then lag2_backed_ret_aic_ew = . ;

    drop _TYPE_ _FREQ_ same_cat1 same_cat2;
run;


/****************************************************************************************************************/
********** 3-step Unsmoothing;
**** FIRST STEP: Obtain Aggregate Economic (unsmoothed) Returns;

/* Category-month panel of reported equal-weighted returns. */
data s1ag00;
    set hf2_ewret;
    keep aut_rank categ yyyymm ret_ew funds_categ_mm;
run;

*demean aggregate category returns;
proc sort data = s1ag00; by categ; run;
proc summary data = s1ag00;
    by categ;
    var ret_ew funds_categ_mm;
    output out = w_mean_ew_ret mean = av_aggrret_categ av_funds_categ_mm;
run;

/* Subtract the category time-series mean. The fund counts are not needed further. */
data s1ag00;
    merge s1ag00 w_mean_ew_ret;
    by categ;
    dem_catret_ew = ret_ew - av_aggrret_categ;
    drop _TYPE_ _FREQ_ funds_categ_mm av_funds_categ_mm;
run;

*category sequence #;
/* Running month number within category: 1 for the first month, 2 for the second, ... */
proc sort data = s1ag00; by categ yyyymm; run;
data s1ag00;
    set s1ag00;
    by categ;
    if first.categ then categ_seq = 0;
    categ_seq + 1;
run;

*Apply MA smoothing model to the aggregate category returns;
*MA with 3 lags;
/* Category-by-category MA(3) without intercept on the demeaned equal-weighted
   return, estimated by maximum likelihood with starting values of -0.2. */
proc sort data = s1ag00; by categ yyyymm; run;
proc arima data= s1ag00;
    identify var = dem_catret_ew noprint;
    estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= ag_arima_ma3_est OUTSTAT=ag_arima_ma3_diag noprint;
    *forecast noprint;
    by categ;
run;
quit;

*Estimated PAIs for MA(3) case;
/* Normalize the coefficients so the smoothing weights sum to one: the raw weights
   (1, -ma1_1, -ma1_2, -ma1_3) are divided by their sum.
   ma3_pai_sum_norm is the check value of the normalization. It now adds all four
   weights (the third lag was left out before) and is kept in the output, in line
   with the fund-level 1-step table arima_ma3_est. */
data ag_arima_ma3_est;
    set ag_arima_ma3_est;
    if _TYPE_ = 'EST';
    ma3_STATUS_ = _STATUS_;
    ma3_pai_sum = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_pai_0 = 1/ma3_pai_sum;
    ma3_pai_1 = -ma1_1/ma3_pai_sum;
    ma3_pai_2 = -ma1_2/ma3_pai_sum;
    ma3_pai_3 = -ma1_3/ma3_pai_sum;
    ma3_pai_sum_norm = ma3_pai_0 + ma3_pai_1 + ma3_pai_2 + ma3_pai_3;
    keep categ ma3_STATUS_ ma3_pai_0 ma3_pai_1 ma3_pai_2 ma3_pai_3 ma3_pai_sum ma3_pai_sum_norm;
run;

*attach pais to main dataset;
proc sort data = ag_arima_ma3_est; by categ; run;
proc sort data = s1ag00; by categ; run;

data s1ag01;
    merge s1ag00 ag_arima_ma3_est;
    by categ;
run;


/* Select the weights used for unsmoothing the aggregate series.
   ma3_gt_1_5 flags outlier estimates (any weight above 1.25 or at or below -0.45,
   a missing weight is flagged by the LE comparison as well). ma3_status flags any
   estimation status other than "0 Converged". Flagged categories fall back to
   weights (1, 0, 0, 0), i.e. no unsmoothing. */
data s1ag01;
    set s1ag01;
    ma3_gt_1_5 = 0;
    if ma3_pai_0 gt 1.25 or ma3_pai_1 gt 1.25 or ma3_pai_2 gt 1.25 or ma3_pai_3 gt 1.25
        then ma3_gt_1_5 = 1;
    if ma3_pai_0 le -0.45 or ma3_pai_1 le -0.45 or ma3_pai_2 le -0.45 or ma3_pai_3 le -0.45
        then ma3_gt_1_5 = 1;

    ma3_status = 0;
    if ma3_STATUS_ ne "0 Converged" then ma3_status = 1;

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_pai_0 = 1;
        sel_pai_1 = 0;
        sel_pai_2 = 0;
        sel_pai_3 = 0;
    end;
    else do;
        sel_pai_0 = ma3_pai_0;
        sel_pai_1 = ma3_pai_1;
        sel_pai_2 = ma3_pai_2;
        sel_pai_3 = ma3_pai_3;
    end;
run;

*Aggregate MA coefficients (pais);
/* One row per category with the selected weights. */
proc sort data = s1ag01(keep = aut_rank categ sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3)
          out  = s1_ag_pais
          nodupkey;
    by aut_rank categ;
run;

*back out estimated unsmoothed returns for aggregate returns;
proc sort data = s1ag01; by categ yyyymm; run;

%macro back_out_catrets;

    /* Working copy: the backed-out category return starts as the demeaned reported
       category return. The first three months of every category keep that value. */
    data s1ag01_loop;
        set s1ag01;
        backed_catret_aic = dem_catret_ew;
        keep aut_rank categ categ_seq backed_catret_aic dem_catret_ew yyyymm av_aggrret_categ
             sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3;
    run;

    proc sort data = s1ag01_loop; by categ yyyymm; run;

    /* Invert the MA(3) filter recursively in ONE pass over the data:
         backed(t) = (dem_catret_ew(t) - pai1*backed(t-1) - pai2*backed(t-2)
                                       - pai3*backed(t-3)) / pai0   for categ_seq >= 4.
       The retained variables hold the already backed-out values of the three
       preceding rows. The recursion starts at categ_seq = 4, so those rows always
       belong to the same category.
       This replaces 273 successive full-dataset passes (one per value of categ_seq)
       and removes the hard-coded limit of 276 months per category: the result is the
       same for categories with up to 276 rows, and longer series are now handled too. */
    data s1ag01_loop;
        set s1ag01_loop;
        retain lag1_backed_catret_aic lag2_backed_catret_aic lag3_backed_catret_aic;

        if categ_seq ge 4 then
            backed_catret_aic = (dem_catret_ew - sel_pai_1*lag1_backed_catret_aic
                                               - sel_pai_2*lag2_backed_catret_aic
                                               - sel_pai_3*lag3_backed_catret_aic)/sel_pai_0;
        output;

        /* shift the window only after the row has been written */
        lag3_backed_catret_aic = lag2_backed_catret_aic;
        lag2_backed_catret_aic = lag1_backed_catret_aic;
        lag1_backed_catret_aic = backed_catret_aic;
    run;

%mend back_out_catrets;
%back_out_catrets;


*check average backed ret by category;
proc sort data = s1ag01_loop; by categ; run;
proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop mean = / autoname;
run;

*Residuals by each category sum to 0;
/* Step 1 of the re-centering: attach the category mean of the backed-out series and
   park the series under a temporary name. */
data s1ag01_loop;
    merge s1ag01_loop check_av_s1ag01_loop;
    by categ;
    temp_backed_catret_aic = backed_catret_aic;
    drop backed_catret_aic;
run;

/* Step 2: subtract the category mean so the backed-out series averages zero within
   each category. */
data s1ag01_loop;
    set s1ag01_loop;
    backed_catret_aic = temp_backed_catret_aic - backed_catret_aic_mean;
    drop backed_catret_aic_mean temp_backed_catret_aic _FREQ_ _TYPE_;
run;

/* Verification table: category means after re-centering. */
proc sort data = s1ag01_loop; by categ; run;
proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop_v2 mean = / autoname;
run;

*Clean dataset and add back mean;
/* dem_backed_catret_aic keeps the zero-mean version, backed_catret_aic gets the
   mean of the reported category return added back. */
data s1ag01_loop;
    set s1ag01_loop;
    dem_backed_catret_aic = backed_catret_aic;
    backed_catret_aic     = backed_catret_aic + av_aggrret_categ;
    keep categ yyyymm backed_catret_aic dem_backed_catret_aic;
run;

proc sort data = s1ag01_loop; by categ yyyymm; run;
proc sort data = s1ag01; by categ yyyymm; run;

/* Category-month panel with reported and unsmoothed aggregate returns. */
data s1ag02;
    merge s1ag01 s1ag01_loop;
    by categ yyyymm;
run;


*************************************************************;
*****************************************;
*Continue to step 2;

***********************************************************************************;
**** SECOND STEP: GET FUND-LEVEL UNSMOOTHED EXCESS RETURNS (EXCESS WRT EW CATEGORY);
** MA will use 3 lags of excess category returns as covariates;

/* fund-level data */
data s2fund00;
    set hf00;
    keep ret yyyymm fundid_mer categ fund_seq av_ret_fundid lag_assets_fill aut_rank;
run;

/* unsmoothed EW series by category-month.
   NOTE(review): ag_aic_adj1_win is assigned to itself. Within the code visible here
   no earlier step creates it, in which case this line only adds an all-missing
   column - confirm whether it is a leftover before removing it. */
data w_au_ewcatert;
    set s1ag02;
    catret_ew       = ret_ew;
    ag_aic_adj1_win = ag_aic_adj1_win;
    keep categ yyyymm catret_ew backed_catret_aic ag_aic_adj1_win;
run;

/* Lags 1 to 3 of the unsmoothed category return. A lag is blanked when the row it
   comes from belongs to a different category. */
proc sort data = w_au_ewcatert; by categ yyyymm; run;
data w_au_ewcatert;
    set w_au_ewcatert;

    lag1_backed_catret_aic = lag1(backed_catret_aic);
    if categ ne lag1(categ) then lag1_backed_catret_aic = . ;

    lag2_backed_catret_aic = lag2(backed_catret_aic);
    if categ ne lag2(categ) then lag2_backed_catret_aic = . ;

    lag3_backed_catret_aic = lag3(backed_catret_aic);
    if categ ne lag3(categ) then lag3_backed_catret_aic = . ;
run;

*attach category-month returns and demean;
proc sort data = s2fund00; by categ yyyymm; run;
proc sort data = w_au_ewcatert; by categ yyyymm; run;

/* Fund return in excess of the reported equal-weighted return of its category. */
data s2fund00;
    merge s2fund00 w_au_ewcatert;
    by categ yyyymm;
    ret_excat = ret - catret_ew;
run;

*Obtain the average return of each fund in excess of the category return;
*also demean unsmoothed EW categ return;
/* Fund-level means of the excess return and of the current and lagged unsmoothed
   category returns (means are taken over the rows of each fund). */
proc sort data = s2fund00; by fundid_mer; run;
proc summary data = s2fund00;
    by fundid_mer;
    var ret_excat
        backed_catret_aic
        lag1_backed_catret_aic
        lag2_backed_catret_aic
        lag3_backed_catret_aic;
    output out = w_mean_excat_ret
           mean = av_retexcat_fundid
                  av_backed_catret_aic_fund
                  av_lag1_backed_catret_aic_fund
                  av_lag2_backed_catret_aic_fund
                  av_lag3_backed_catret_aic_fund;
run;

/* Subtract the fund-level means from each series. */
data s2fund00;
    merge s2fund00 w_mean_excat_ret;
    by fundid_mer;
    dem_ret_excat              = ret_excat              - av_retexcat_fundid;
    dem_backed_catret_aic      = backed_catret_aic      - av_backed_catret_aic_fund;
    dem_lag1_backed_catret_aic = lag1_backed_catret_aic - av_lag1_backed_catret_aic_fund;
    dem_lag2_backed_catret_aic = lag2_backed_catret_aic - av_lag2_backed_catret_aic_fund;
    dem_lag3_backed_catret_aic = lag3_backed_catret_aic - av_lag3_backed_catret_aic_fund;
run;

*check average dem_ret_excat;
/* Verification table: fund-level means of the demeaned series. */
proc summary data = s2fund00;
    by fundid_mer;
    var dem_ret_excat dem_backed_catret_aic dem_lag1_backed_catret_aic;
    output out = check_avg_dem_ret_excat mean = / autoname;
run;

*Estimate MA on returns in excess of category returns;
*MA with 3 lags;
/* Fund-by-fund MA(3) without intercept on the demeaned excess return, with the
   demeaned unsmoothed category return and its three lags as input series.
   Maximum likelihood, starting values of -0.2 for the MA parameters. */
proc sort data = s2fund00; by fundid_mer yyyymm; run;
proc arima data= s2fund00;
    identify var = dem_ret_excat crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noprint;
    estimate q= 3 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est OUTSTAT= excat_arima_ma3_diag noprint;
    *forecast noprint;
    by fundid_mer;
run;
quit;

*MA3;
/* Normalize the coefficients so the smoothing weights sum to one: the raw weights
   (1, -ma1_1, -ma1_2, -ma1_3) are divided by their sum.
   ma3_phi_sum_norm is the check value of the normalization. It now adds all four
   weights (the third lag was left out before) and is kept in the output, in line
   with the fund-level 1-step table arima_ma3_est. */
data excat_arima_ma3_est;
    set excat_arima_ma3_est;
    if _TYPE_ = 'EST';
    ma3_STATUS_ = _STATUS_;
    ma3_phi_sum = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_phi_0 = 1/ma3_phi_sum;
    ma3_phi_1 = -ma1_1/ma3_phi_sum;
    ma3_phi_2 = -ma1_2/ma3_phi_sum;
    ma3_phi_3 = -ma1_3/ma3_phi_sum;
    ma3_phi_sum_norm = ma3_phi_0 + ma3_phi_1 + ma3_phi_2 + ma3_phi_3;
    keep fundid_mer ma3_STATUS_ ma3_phi_0 ma3_phi_1 ma3_phi_2 ma3_phi_3 ma3_phi_sum ma3_phi_sum_norm;
run;

*attach phis to main dataset;
proc sort data = s2fund00; by fundid_mer; run;
proc sort data = excat_arima_ma3_est; by fundid_mer; run;

data s2fund01;
    merge s2fund00 excat_arima_ma3_est;
    by fundid_mer;
run;


data s2fund01;
set s2fund01;
ma3_gt_1_5 = 0; 
if ma3_phi_0 gt 1.25 or ma3_phi_1 gt 1.25 or ma3_phi_2 gt 1.25 or ma3_phi_3 gt 1.25 then ma3_gt_1_5 = 1;
if ma3_phi_0 le -0.45 or ma3_phi_1 le -0.45 or ma3_phi_2 le -0.45 or ma3_phi_3 le -0.45 then ma3_gt_1_5 = 1;
ma3_status = 0; if ma3_STATUS_ ne "0 Converged" then ma3_status = 1;
sel_phi_0 = ma3_phi_0; 
sel_phi_1 = ma3_phi_1; 
sel_phi_2 = ma3_phi_2; 
sel_phi_3 = ma3_phi_3;
if ma3_gt_1_5 = 1 then do;
sel_phi_0 = 1; 
sel_phi_1 = 0; 
sel_phi_2 = 0; 
sel_phi_3 = 0;
end;
if ma3_status = 1 then do;
sel_phi_0 = 1; 
sel_phi_1 = 0; 
sel_phi_2 = 0; 
sel_phi_3 = 0;
end;
run;

/* Aggregate the selected MA weights by category.                                   */
/* One row per fund is kept first, then a single PROC SUMMARY pass writes three     */
/* data sets: category means, min/max (outlier check) and 1st/99th percentiles.     */
data w_s2_res_phis(keep = aut_rank categ fundid_mer sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3);
  set s2fund01;
run;

proc sort data = w_s2_res_phis nodupkey;
  by aut_rank categ fundid_mer;
run;

proc summary data = w_s2_res_phis;
  by aut_rank categ;
  var sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
  /* category averages */
  output out = s2_res_phis
         mean = sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
  /* outlier check: min and max */
  output out = s2_res_phis_minmax
         min = max = / autoname;
  /* outlier check: 1% and 99% */
  output out = s2_res_phis_p1p99
         p1 = p99 = / autoname;
run;

/* Put the three sets of category-level coefficients side by side        */
/* (v_av_thetas, s1_ag_pais, s2_res_phis), matched on aut_rank and categ. */
data tab_ma_coeff_all(drop = _TYPE_ _FREQ_);
  merge v_av_thetas
        s1_ag_pais
        s2_res_phis;
  by aut_rank categ;
run;

*Inspect unsmoothing coefficients for 1-step and 3-step method;
/*
aut_rank	categ	sel_theta_0	sel_theta_1	sel_theta_2	sel_theta_3	sel_pai_0	sel_pai_1	sel_pai_2	sel_pai_3	sel_phi_0	sel_phi_1	sel_phi_2	sel_phi_3
1	Relative_Value	0.72106	0.15158	0.077103	0.050256	0.56628	0.28817	0.14191	0.003642	0.81326	0.086125	0.063406	0.037214
2	Event_driven	0.73847	0.15177	0.060462	0.049295	0.57558	0.24405	0.11108	0.069287	0.84862	0.086938	0.029386	0.035061
3	Multi_strategy	0.75623	0.14868	0.068351	0.026743	0.5502	0.21617	0.14321	0.090428	0.85001	0.099439	0.03669	0.013858
4	Emerging_Markets	0.80089	0.12112	0.04217	0.035817	0.6871	0.2214	0.06568	0.025828	0.90357	0.063868	0.024479	0.008087
5	Sector	0.85101	0.09178	0.042634	0.014571	0.77257	0.1604	0.03889	0.028138	0.88543	0.063769	0.031801	0.018999
6	Long_Only	0.85168	0.08411	0.026132	0.038073	0.77374	0.16522	0.03592	0.025116	0.91753	0.038414	0.010876	0.033185
7	Long_Short	0.85337	0.08142	0.033172	0.03204	0.72163	0.14941	0.07546	0.053493	0.90478	0.049302	0.018424	0.027498
8	Market_Neutral	0.89176	0.06327	0.036533	0.008436	0.81723	0.14408	0.05878	-0.020093	0.88408	0.053234	0.029386	0.033303
9	Global Macro	0.91902	0.06541	0.014903	0.000669	0.95994	0.07266	-0.02859	-0.004007	0.92847	0.063402	0.012399	-0.004275
10	CTA	0.96333	0.02125	0.014271	0.001146	1.04364	0.00365	-0.0494	0.002111	0.94058	0.03168	0.020696	0.007041
*/


/* Back out the estimated unsmoothed excess returns. */
proc sort data = s2fund01; by fundid_mer yyyymm; run;

/* back_out_exrets: builds s2fund01_loop with backed_ret_excat_aic.                     */
/* Rows with fund_seq 1-3 keep the observed dem_ret_excat. For fund_seq = 4, 5, ...     */
/* the value is obtained recursively from the three previous backed-out values:         */
/*   backed(t) = (dem_ret_excat(t) - phi_1*backed(t-1) - phi_2*backed(t-2)              */
/*                - phi_3*backed(t-3)) / phi_0.                                         */
/* One data step pass is made per value of fund_seq so that each pass sees the values   */
/* written by the previous pass. The LAG calls read the previous rows of the sorted     */
/* file, so this presumably relies on fund_seq running 1, 2, 3, ... without gaps        */
/* within each fundid_mer - TODO confirm against the step that creates fund_seq.        */
/* The upper bound of the loop is the largest fund_seq found in the data (it used to be */
/* hard-coded as 276, which would leave longer histories partly unprocessed).           */
/* The lagN_backed_ret_excat_aic helper columns left in s2fund01_loop are scratch.      */
%macro back_out_exrets;
%local i max_seq;

data s2fund01_loop;
set s2fund01;
backed_ret_excat_aic = dem_ret_excat;
keep backed_ret_excat_aic dem_ret_excat fundid_mer yyyymm fund_seq av_ret_fundid
sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3 av_retexcat_fundid;
run;

proc sort data = s2fund01_loop; by fundid_mer yyyymm; run;

/* longest fund history in the data drives the number of passes */
%let max_seq = 0;
proc sql noprint;
select max(fund_seq) format = 12. into :max_seq from s2fund01_loop;
quit;
%let max_seq = &max_seq; /* strip the blanks PROC SQL pads the value with */
%if "&max_seq" = "." %then %let max_seq = 0; /* empty input: no passes */

%do i = 4 %to &max_seq; /* MA3 */

data s2fund01_loop;
set s2fund01_loop;
lag1_backed_ret_excat_aic = lag1(backed_ret_excat_aic);
lag2_backed_ret_excat_aic = lag2(backed_ret_excat_aic);
lag3_backed_ret_excat_aic = lag3(backed_ret_excat_aic);

if fund_seq = &i then backed_ret_excat_aic = (dem_ret_excat - sel_phi_1*lag1_backed_ret_excat_aic - sel_phi_2*lag2_backed_ret_excat_aic - sel_phi_3*lag3_backed_ret_excat_aic)/sel_phi_0;
run;

%end;

%mend back_out_exrets;
%back_out_exrets;


/* Keep only the fund-month key and the backed-out series, stored under the name */
/* res_backed_ret_excat_aic, then merge it back onto the fund panel.             */
data s2fund01_loop(keep = fundid_mer yyyymm res_backed_ret_excat_aic);
  set s2fund01_loop;
  res_backed_ret_excat_aic = backed_ret_excat_aic;
run;

proc sort data = s2fund01_loop;
  by fundid_mer yyyymm;
run;

proc sort data = s2fund01;
  by fundid_mer yyyymm;
run;

data s2fund02;
  merge s2fund01
        s2fund01_loop;
  by fundid_mer yyyymm;
run;

/* Check: per-fund means of the observed and the backed-out excess returns. */
proc means data = s2fund02 noprint;
  by fundid_mer;
  var dem_ret_excat res_backed_ret_excat_aic;
  output out = check_step2_mean_excatret mean = / autoname;
run;

***********************************************************************;
**** STEP 3 : Add up residual from aggregate MA(3) and excess ret MA(3);
/* Category-level series taken from s1ag02 (one row per categ and yyyymm). */
data s1_resid(keep = categ yyyymm av_aggrret_categ backed_catret_aic dem_backed_catret_aic);
  set s1ag02;
run;

/* Fund-level category-excess series taken from s2fund02. */
data s2_resid(keep = fundid_mer yyyymm dem_ret_excat res_backed_ret_excat_aic);
  set s2fund02;
run;

/* Merge both onto the main data set: first by category-month, then by fund-month. */
proc sort data = hf02;
  by categ yyyymm;
run;

proc sort data = s1_resid;
  by categ yyyymm;
run;

data s3_00;
  merge hf02
        s1_resid;
  by categ yyyymm;
run;

proc sort data = s3_00;
  by fundid_mer yyyymm;
run;

proc sort data = s2_resid;
  by fundid_mer yyyymm;
run;

data s3_00;
  merge s3_00
        s2_resid;
  by fundid_mer yyyymm;
run;

/* 3-step unsmoothed return: category component plus fund-specific component, */
/* with the fund mean return (av_ret_fundid) added back.                      */
data s3_00;
  set s3_00;
  s3_uns_ret_aic_temp = dem_backed_catret_aic + res_backed_ret_excat_aic + av_ret_fundid;
run;

/* Make the fund mean identical across methodologies: subtract the fund mean */
/* of the temporary series and add the fund mean of av_ret_fundid.           */
proc sort data = s3_00;
  by fundid_mer;
run;

proc means data = s3_00 noprint;
  by fundid_mer;
  var s3_uns_ret_aic_temp av_ret_fundid;
  output out = w_adj_s3_uns mean = / autoname;
run;

data s3_00(drop = s3_uns_ret_aic_temp_mean av_ret_fundid_mean _TYPE_ _FREQ_);
  merge s3_00
        w_adj_s3_uns;
  by fundid_mer;
  s3_uns_ret_aic = s3_uns_ret_aic_temp - s3_uns_ret_aic_temp_mean + av_ret_fundid_mean;
run;


/* Check: category averages of the observed and unsmoothed return series. */
proc sort data = s3_00;
  by categ;
run;

proc means data = s3_00 noprint;
  by categ;
  var ret av_ret_fundid backed_ret_aic s3_uns_ret_aic
      dem_backed_catret_aic res_backed_ret_excat_aic;
  output out = check_avret_s3_00_all mean = / autoname;
run;

/* Reduced copy of s3_00 kept as the saved data set. */
data s3_clean_minaum5_ma3fix(keep = ret yyyymm assets_fill bh_fund fundid_mer categ
                                    lag_assets_fill fundid_mer_obs fund_seq
                                    backed_ret_aic s3_uns_ret_aic backed_catret_aic);
  set s3_00;
run;


************** END OF UNSMOOTHING CODE ********************;
/********************************************************************************************/
/********************************************************************************************/



/********************************************************************************************/
/********************************************************************************************/

***************************** PROCEED TO ANALYSIS OF UNSMOOTHED RETURNS;
*(housekeeping comment: the following is based on code "HF_output" before it was merged here in order to publish the code);

/* Shorter names for the return series, plus a three-way group based on aut_rank: */
/*   aut_rank 1-3 (or missing) -> High, 4-8 -> Mid, 9 and above -> Low.           */
data s3_00a(keep = fundid_mer bh_fund assets_fill lag_assets_fill yyyymm
                   ret categ aut_rank strat_il_gr fundid_mer_obs fund_seq
                   glm_ret s3_ret uns_catret);
  set s3_00;
  length strat_il_gr $4;

  if aut_rank gt 8 then strat_il_gr = 'Low';
  else if aut_rank gt 3 then strat_il_gr = 'Mid';
  else strat_il_gr = 'High';

  glm_ret    = backed_ret_aic;
  s3_ret     = s3_uns_ret_aic;
  uns_catret = backed_catret_aic;
run;

/* Attach the imputed add date (per Jorion and Schwartz 2019).                   */
/* fundid_mer is rebuilt as 100 * source id + 1 for the tass file and + 2 for    */
/* the bh file, then both files are stacked and matched to the panel by fund.    */
data tass_imputed_add_date1(keep = fundid_mer imputed_add_date);
  set hfautoc.tass_imputed_add_date;
  fundid_mer = 100*fundid + 1;
run;

data bh_imputed_add_date1(keep = fundid_mer imputed_add_date);
  set hfautoc.bh_imputed_add_date;
  fundid_mer = 100*fund_id + 2;
run;

data imputed_add_dates;
  set tass_imputed_add_date1
      bh_imputed_add_date1;
run;

proc sort data = s3_00a;
  by fundid_mer;
run;

proc sort data = imputed_add_dates;
  by fundid_mer;
run;

/* keep only funds that are present in s3_00a */
data s3_00a;
  merge s3_00a(in = in_panel)
        imputed_add_dates;
  by fundid_mer;
  if in_panel;
run;

/* Read the hedge fund factors and copy them into fh1-fh8. */
data hffact(keep = yyyymm fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 rf);
  set hfautoc.hf_factors;
  fh1 = sp500_rf;
  fh2 = size_spread;
  fh3 = emerg_mkt_rf;
  fh4 = FS_bond_mkt;
  fh5 = FS_credit_sprd;
  fh6 = PTFSBD;
  fh7 = PTFSFX;
  fh8 = PTFSCOM;
run;

/* Match the factors to the fund panel by month; months without funds are dropped. */
proc sort data = s3_00a;
  by yyyymm;
run;

proc sort data = hffact;
  by yyyymm;
run;

data s3_00a;
  merge s3_00a(in = in_panel)
        hffact;
  by yyyymm;
  if in_panel;
run;

/* Returns in excess of rf for the three return versions. */
data s3_00a;
  set s3_00a;
  retrf     = ret     - rf;
  glm_retrf = glm_ret - rf;
  s3_retrf  = s3_ret  - rf;
run;

/* Sample size per strategy (N funds and average T), as reported in Table 1. */
/* Step 1: non-missing ret observations per fund.                            */
/* Step 2: number of funds and average observation count per category.       */
proc sort data = s3_00a;
  by aut_rank categ fundid_mer yyyymm;
run;

proc means data = s3_00a noprint;
  by aut_rank categ fundid_mer;
  var ret;
  output out = v_count_obs_fund n = n_ret;
run;

proc means data = v_count_obs_fund noprint;
  by aut_rank categ;
  var n_ret;
  output out = v_count_obs_categ n = n_funds mean = avg_T;
run;

**************************************************;
******************Autocorrelation Tables;
**** These autocorrelations are reported in Table 2 and 3;
*Up to 4 lags;

/* Slim copy of the panel used as input for the autocorrelation tables. */
data lagret_fund(keep = ret yyyymm fundid_mer categ fund_seq aut_rank strat_il_gr
                        glm_ret s3_ret);
  set s3_00a;
run;

/* Autocorrelation tables, built once per version of the return.                          */
/*                                                                                        */
/* %lagret_autocorr(retvar)                                                               */
/*   retvar : name of the return variable in lagret_fund (ret, glm_ret or s3_ret).        */
/*   Output : lagret_fund_reg_avg_<retvar>  fund-by-fund regressions of the return on its */
/*            own lags 1-4, averaged by category (rows PARMS, T and t_p10, where t_p10 is */
/*            the share of funds with a t-statistic of at least 1.65).                    */
/*            lagret_fund_reg_ew_<retvar>   the same regressions on the equal-weighted    */
/*            category return (rows PARMS, T and w_pv = p-value).                         */
/*   In both outputs type = 'multivar' holds the regression on all four lags and          */
/*   type = 'univar' collects the four single-lag regressions in one row.                 */
/*   Work data sets (lagret_fund_select, v_lagret_fund_reg, w_lagret_...) are overwritten */
/*   on every call.                                                                       */
/* This macro replaces three pasted copies of the same code that differed only in f_rets. */
%macro lagret_autocorr(retvar);
%local k;

/* ---------------- fund-level autoregressions ---------------- */
data lagret_fund_select;
set lagret_fund;
rets = &retvar;
run;

proc sort data = lagret_fund_select; by fundid_mer yyyymm; run;

/* lags 1-4; the first k rows of a fund (fund_seq le k) would pick up another fund */
data lagret_fund_select;
set lagret_fund_select;
%do k = 1 %to 4;
lag&k._rets = lag&k(rets);
if fund_seq le &k then lag&k._rets = . ;
%end;
run;

data lagret_fund_clean; set lagret_fund_select; if fund_seq gt 4; run;

/* run regressions; unlabeled models are named MODEL1-MODEL5 in _MODEL_ */
proc sort data = lagret_fund_clean; by aut_rank categ fundid_mer yyyymm; run;
proc reg data = lagret_fund_clean outest = v_lagret_fund_reg noprint tableout;
/* multivariate */
model rets = lag1_rets lag2_rets lag3_rets lag4_rets /edf ADJRSQ;
/* univariate */
model rets = lag1_rets /edf ADJRSQ;
model rets = lag2_rets /edf ADJRSQ;
model rets = lag3_rets /edf ADJRSQ;
model rets = lag4_rets /edf ADJRSQ;
by aut_rank categ fundid_mer;
run; quit;

/* keep coefficients and t-statistics; flag t-statistics of at least 1.65 */
data v_lagret_fund_reg;
set v_lagret_fund_reg;
if _TYPE_ = 'PARMS' or _TYPE_ = 'T';
model_n = 100;
model_n = substr(_MODEL_,6,2)*1; /* model number taken from the text MODELn */
%do k = 1 %to 4;
t_lag&k._rets_ge_165 = 0;
%end;
%do k = 1 %to 4;
if _TYPE_ = 'T' and lag&k._rets ge 1.65 then t_lag&k._rets_ge_165 = 1;
%end;
statis = _TYPE_;
run;

/* average coefficients and t-statistics by model and category */
proc sort data = v_lagret_fund_reg; by model_n aut_rank categ statis; run;
proc summary data = v_lagret_fund_reg;
var lag1_rets lag2_rets lag3_rets lag4_rets
t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
output out = v_lagret_fund_reg_avg
mean = lag1_rets lag2_rets lag3_rets lag4_rets
t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
by model_n aut_rank categ statis;
run;

/* extra row t_p10: the mean of the 0/1 flags on the T rows */
data wv_lagret_fund_reg_avg1;
set v_lagret_fund_reg_avg(drop = lag1_rets lag2_rets lag3_rets lag4_rets);
if statis = 'T';
statis = 't_p10';
%do k = 1 %to 4;
lag&k._rets = t_lag&k._rets_ge_165;
%end;
run;

/* attach to main dataset, then organize */
data w_lagret_fund_reg_avg_org;
set v_lagret_fund_reg_avg wv_lagret_fund_reg_avg1;
drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

proc sort data = w_lagret_fund_reg_avg_org; by model_n aut_rank categ statis; run;

data w_lagret_fund_reg_avg_org_m1; set w_lagret_fund_reg_avg_org; if model_n = 1; type = 'multivar'; run;

data w_lagret_fund_reg_avg_org_m2; set w_lagret_fund_reg_avg_org; if model_n = 2; drop lag2_rets lag3_rets lag4_rets; run;
data w_lagret_fund_reg_avg_org_m3; set w_lagret_fund_reg_avg_org; if model_n = 3; keep lag2_rets; run;
data w_lagret_fund_reg_avg_org_m4; set w_lagret_fund_reg_avg_org; if model_n = 4; keep lag3_rets; run;
data w_lagret_fund_reg_avg_org_m5; set w_lagret_fund_reg_avg_org; if model_n = 5; keep lag4_rets; run;

/* merge without BY: rows are paired by position, which relies on all four single-lag */
/* models having the same category and statis rows in the same sort order             */
data w_lagret_fund_reg_avg_org_m2345;
merge w_lagret_fund_reg_avg_org_m2 w_lagret_fund_reg_avg_org_m3 w_lagret_fund_reg_avg_org_m4 w_lagret_fund_reg_avg_org_m5;
type = 'univar';
run;

data lagret_fund_reg_avg_&retvar;
set w_lagret_fund_reg_avg_org_m1 w_lagret_fund_reg_avg_org_m2345;
run;

proc sort data = lagret_fund_reg_avg_&retvar; by model_n aut_rank categ statis; run;

/* ---------------- aggregate (equal-weighted) autocorrelations ---------------- */
proc sort data = lagret_fund_select; by aut_rank categ yyyymm; run;
proc summary data = lagret_fund_select;
var rets;
output out = lagret_fund_select_ew mean = rets_ew;
by aut_rank categ yyyymm;
run;

/* lags 1-4 of the category return. A lag is blanked when it falls in the first k rows */
/* of a category (it would come from the previous category) or, as before, when the    */
/* month is at or before the k-th month of 1995.                                       */
data lagret_fund_select_ew;
set lagret_fund_select_ew;
by aut_rank categ;
if first.categ then categ_obs = 0;
categ_obs + 1;
%do k = 1 %to 4;
lag&k._rets_ew = lag&k(rets_ew);
if categ_obs le &k or yyyymm le %eval(199500 + &k) then lag&k._rets_ew = . ;
%end;
drop _TYPE_ _FREQ_ categ_obs;
run;

data lagret_fund_clean_ew; set lagret_fund_select_ew; if lag4_rets_ew ne . ; run;

proc sort data = lagret_fund_clean_ew; by aut_rank categ yyyymm; run;
proc reg data = lagret_fund_clean_ew outest = v_lagret_fund_reg_ew noprint tableout;
/* multivariate */
model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew /edf ADJRSQ;
/* univariate */
model rets_ew = lag1_rets_ew /edf ADJRSQ;
model rets_ew = lag2_rets_ew /edf ADJRSQ;
model rets_ew = lag3_rets_ew /edf ADJRSQ;
model rets_ew = lag4_rets_ew /edf ADJRSQ;
by aut_rank categ;
run; quit;

data w_lagret_fund_reg_ew_org;
set v_lagret_fund_reg_ew;
model_n = 100;
model_n = substr(_MODEL_,6,2)*1;
statis = _TYPE_;
if _TYPE_ = 'PVALUE' then statis = 'w_pv';
if _TYPE_ = 'PARMS' or _TYPE_ = 'T' or _TYPE_ = 'PVALUE';
keep aut_rank statis categ model_n _TYPE_ lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

proc sort data = w_lagret_fund_reg_ew_org; by model_n aut_rank categ statis; run;

data w_lagret_fund_reg_ew_org_m1; set w_lagret_fund_reg_ew_org; if model_n = 1; type = 'multivar'; run;

data w_lagret_fund_reg_ew_org_m2; set w_lagret_fund_reg_ew_org; if model_n = 2; drop lag2_rets_ew lag3_rets_ew lag4_rets_ew; run;
data w_lagret_fund_reg_ew_org_m3; set w_lagret_fund_reg_ew_org; if model_n = 3; keep lag2_rets_ew; run;
data w_lagret_fund_reg_ew_org_m4; set w_lagret_fund_reg_ew_org; if model_n = 4; keep lag3_rets_ew; run;
data w_lagret_fund_reg_ew_org_m5; set w_lagret_fund_reg_ew_org; if model_n = 5; keep lag4_rets_ew; run;

/* merge without BY: rows are paired by position (same sort order in all four inputs) */
data w_lagret_fund_reg_ew_org_m2345;
merge w_lagret_fund_reg_ew_org_m2 w_lagret_fund_reg_ew_org_m3 w_lagret_fund_reg_ew_org_m4 w_lagret_fund_reg_ew_org_m5;
type = 'univar';
run;

data lagret_fund_reg_ew_&retvar;
set w_lagret_fund_reg_ew_org_m1 w_lagret_fund_reg_ew_org_m2345;
run;

proc sort data = lagret_fund_reg_ew_&retvar; by model_n aut_rank statis; run;

%mend lagret_autocorr;

/* RET (return as reported/observed) */
%let f_rets = ret;
%lagret_autocorr(&f_rets);

/* GLM_RET */
%let f_rets = glm_ret;
%lagret_autocorr(&f_rets);

/* S3 RET */
%let f_rets = s3_ret;
%lagret_autocorr(&f_rets);


********************************************************************************************************************************;
******************* Fung-Hsieh Regressions;
*run fund-by-fund regressions;

/* The first 3 observations of each fund are not unsmoothed; they are dropped so */
/* that the three return versions are compared on the same rows.                 */
data s3_00_select;
  set s3_00a(where = (fund_seq gt 3));
run;

proc sort data = s3_00_select;
  by aut_rank categ fundid_mer yyyymm;
run;

/* Fund-by-fund regressions of each excess return on the eight factors fh1-fh8. */
proc reg data = s3_00_select outest = v_s3_fund_fh00 noprint tableout;
  by aut_rank categ fundid_mer;
  model retrf     = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
  model glm_retrf = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
  model s3_retrf  = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
run;
quit;

/* Pull the estimated coefficients (PARMS rows) of one regression out of           */
/* v_s3_fund_fh00 and store them under prefixed names.                             */
/*   prefix : r0, r1 or r3 - prefix of the output variables and data set suffix    */
/*   depvar : value of _DEPVAR_ identifying the regression                         */
/* Result: v_s3_fund_fh00_coef_<prefix> with <prefix>_fh_int and <prefix>_fh1-fh8. */
%macro fh_coef_extract(prefix, depvar);
%local k;
data v_s3_fund_fh00_coef_&prefix;
set v_s3_fund_fh00;
if _TYPE_ = 'PARMS';
if _DEPVAR_ = "&depvar";
&prefix._fh_int = intercept;
%do k = 1 %to 8;
&prefix._fh&k = fh&k;
%end;
keep aut_rank categ fundid_mer
&prefix._fh_int
%do k = 1 %to 8; &prefix._fh&k %end;
;
run;
%mend fh_coef_extract;

%fh_coef_extract(r0, retrf);
%fh_coef_extract(r1, glm_retrf);
%fh_coef_extract(r3, s3_retrf);

/* one row per fund with the coefficients of all three regressions */
data v_s3_fund_fh00_coef_all;
  merge v_s3_fund_fh00_coef_r0
        v_s3_fund_fh00_coef_r1
        v_s3_fund_fh00_coef_r3;
  by aut_rank categ fundid_mer;
run;

/* Attach the fund betas to the monthly returns in order to obtain monthly alphas. */
/* backfilled_i marks months before the imputed add date of the fund.              */
data v_rets(keep = retrf glm_retrf s3_retrf fund_seq yyyymm
                   fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8
                   aut_rank categ fundid_mer backfilled_i);
  set s3_00_select;
  backfilled_i = (yyyymm lt imputed_add_date);
run;

proc sort data = v_rets;
  by aut_rank categ fundid_mer;
run;

proc sort data = v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

data v_rets_betas;
  merge v_rets
        v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

/* Monthly alpha = excess return minus the sum of beta times factor over fh1-fh8 */
/* (the regression intercept is not subtracted).                                 */
data v_rets_betas;
  set v_rets_betas;
  array factor_{8}  fh1-fh8;
  array beta_r0_{8} r0_fh1-r0_fh8;
  array beta_r1_{8} r1_fh1-r1_fh8;
  array beta_r3_{8} r3_fh1-r3_fh8;

  r0_fh_a = retrf;
  r1_fh_a = glm_retrf;
  r3_fh_a = s3_retrf;
  do fac_idx_ = 1 to 8;
    r0_fh_a = r0_fh_a - beta_r0_{fac_idx_}*factor_{fac_idx_};
    r1_fh_a = r1_fh_a - beta_r1_{fac_idx_}*factor_{fac_idx_};
    r3_fh_a = r3_fh_a - beta_r3_{fac_idx_}*factor_{fac_idx_};
  end;
  drop fac_idx_;
run;

/* Drop the backfilled months and add short names for the three excess returns. */
data v_rets_betas_gt12mm;
  set v_rets_betas;
  if backfilled_i ne 1;
  r0 = retrf;
  r1 = glm_retrf;
  r3 = s3_retrf;
run;

/* Per fund: average monthly alpha, standard deviation of the three excess returns, */
/* and the t-statistic of the monthly alphas. One pass writes both data sets.       */
proc sort data = v_rets_betas_gt12mm;
  by aut_rank categ fundid_mer;
run;

proc summary data = v_rets_betas_gt12mm;
  by aut_rank categ fundid_mer;
  var r0_fh_a r1_fh_a r3_fh_a r0 r1 r3;
  /* means of the alphas and standard deviations of the returns only */
  output out = fund_fhalpha_avg
         mean(r0_fh_a r1_fh_a r3_fh_a) = r0_fh_a r1_fh_a r3_fh_a
         std(r0 r1 r3)                 = r0_std r1_std r3_std;
  output out = fund_fhalpha_t
         t(r0_fh_a r1_fh_a r3_fh_a)    = r0_fh_a r1_fh_a r3_fh_a;
run;

/* attach fund alpha to file with coefficients */
data s3_fund_fh00_coef_all(drop = _TYPE_ _FREQ_);
  merge fund_fhalpha_avg
        v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;



/* Differences between the three return versions for every statistic:          */
/*   r1r0_x = r1_x - r0_x,  r3r0_x = r3_x - r0_x,  r3r1_x = r3_x - r1_x        */
/* for x = fh_a, std, fh1, ..., fh8 (generated in that order).                 */
%macro fh_coef_diffs;
%local k sfx sfx_list;
%let sfx_list = fh_a std fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
data s3_fund_fh00_coef_all;
set s3_fund_fh00_coef_all;
%do k = 1 %to 10;
%let sfx = %scan(&sfx_list, &k, %str( ));
r1r0_&sfx = r1_&sfx - r0_&sfx;
r3r0_&sfx = r3_&sfx - r0_&sfx;
r3r1_&sfx = r3_&sfx - r1_&sfx;
%end;
run;
%mend fh_coef_diffs;
%fh_coef_diffs;

/* Category averages of the fund-level alphas, volatilities, betas and their differences. */
/* The variable list is held in a macro variable; MEAN= without names keeps the original  */
/* variable names in the output data set.                                                 */
%let fh_coef_stat_vars =
r0_fh_a r1_fh_a r3_fh_a r1r0_fh_a r3r0_fh_a r3r1_fh_a
r0_std r1_std r3_std r1r0_std r3r0_std r3r1_std
r0_fh1 r1_fh1 r3_fh1 r1r0_fh1 r3r0_fh1 r3r1_fh1
r0_fh2 r1_fh2 r3_fh2 r1r0_fh2 r3r0_fh2 r3r1_fh2
r0_fh3 r1_fh3 r3_fh3 r1r0_fh3 r3r0_fh3 r3r1_fh3
r0_fh4 r1_fh4 r3_fh4 r1r0_fh4 r3r0_fh4 r3r1_fh4
r0_fh5 r1_fh5 r3_fh5 r1r0_fh5 r3r0_fh5 r3r1_fh5
r0_fh6 r1_fh6 r3_fh6 r1r0_fh6 r3r0_fh6 r3r1_fh6
r0_fh7 r1_fh7 r3_fh7 r1r0_fh7 r3r0_fh7 r3r1_fh7
r0_fh8 r1_fh8 r3_fh8 r1r0_fh8 r3r0_fh8 r3r1_fh8;

proc sort data = s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

proc summary data = s3_fund_fh00_coef_all;
  by aut_rank categ;
  var &fh_coef_stat_vars;
  output out = s3_fund_fh00_coef_all_av mean = ;
run;

/* label the rows as averages */
data s3_fund_fh00_coef_all_av(drop = _TYPE_ _FREQ_);
  set s3_fund_fh00_coef_all_av;
  stat = 'average';
run;

proc summary data = s3_fund_fh00_coef_all;
var r0_fh_a r1_fh_a r3_fh_a r1r0_fh_a r3r0_fh_a r3r1_fh_a
r0_std r1_std r3_std r1r0_std r3r0_std r3r1_std
r0_fh1 r1_fh1 r3_fh1 r1r0_fh1 r3r0_fh1 r3r1_fh1
r0_fh2 r1_fh2 r3_fh2 r1r0_fh2 r3r0_fh2 r3r1_fh2
r0_fh3 r1_fh3 r3_fh3 r1r0_fh3 r3r0_fh3 r3r1_fh3
r0_fh4 r1_fh4 r3_fh4 r1r0_fh4 r3r0_fh4 r3r1_fh4
r0_fh5 r1_fh5 r3_fh5 r1r0_fh5 r3r0_fh5 r3r1_fh5
r0_fh6 r1_fh6 r3_fh6 r1r0_fh6 r3r0_fh6 r3r1_fh6
r0_fh7 r1_fh7 r3_fh7 r1r0_fh7 r3r0_fh7 r3r1_fh7
r0_fh8 r1_fh8 r3_fh8 r1r0_fh8 r3r0_fh8 r3r1_fh8;
output out = s3_fund_fh00_coef_all_t
t = r0_fh_a r1_fh_a r3_fh_a r1r0_fh_a r3r0_fh_a r3r1_fh_a
r0_std r1_std r3_std r1r0_std r3r0_std r3r1_std
r0_fh1 r1_fh1 r3_fh1 r1r0_fh1 r3r0_fh1 r3r1_fh1
r0_fh2 r1_fh2 r3_fh2 r1r0_fh2 r3r0_fh2 r3r1_fh2
r0_fh3 r1_fh3 r3_fh3 r1r0_fh3 r3r0_fh3 r3r1_fh3
r0_fh4 r1_fh4 r3_fh4 r1r0_fh4 r3r0_fh4 r3r1_fh4
r0_fh5 r1_fh5 r3_fh5 r1r0_fh5 r3r0_fh5 r3r1_fh5
r0_fh6 r1_fh6 r3_fh6 r1r0_fh6 r3r0_fh6 r3r1_fh6
r0_fh7 r1_fh7 r3_fh7 r1r0_fh7 r3r0_fh7 r3r1_fh7
r0_fh8 r1_fh8 r3_fh8 r1r0_fh8 r3r0_fh8 r3r1_fh8;
by aut_rank categ;
run;

data s3_fund_fh00_coef_all_t; 
set s3_fund_fh00_coef_all_t; 
stat = 'tstat'; 
drop _TYPE_ _FREQ_; 
run;

*Share of coefficients that are significant at the 10 percent level;
*Step 1: from the regression output keep the rows with _TYPE_ equal to T for each;
*dependent variable and copy fh1-fh8 into series-specific names (r0_, r1_, r3_);

*reported returns (retrf);
data v_s3_fund_fh00_t_r0 (keep = aut_rank categ fundid_mer r0_fh1-r0_fh8);
    set v_s3_fund_fh00;
    if _TYPE_ = 'T' and _DEPVAR_ = 'retrf';
    array _src {8} fh1-fh8;
    array _dst {8} r0_fh1-r0_fh8;
    do _k = 1 to 8;
        _dst{_k} = _src{_k};
    end;
run;

*glm_retrf;
data v_s3_fund_fh00_t_r1 (keep = aut_rank categ fundid_mer r1_fh1-r1_fh8);
    set v_s3_fund_fh00;
    if _TYPE_ = 'T' and _DEPVAR_ = 'glm_retrf';
    array _src {8} fh1-fh8;
    array _dst {8} r1_fh1-r1_fh8;
    do _k = 1 to 8;
        _dst{_k} = _src{_k};
    end;
run;

*s3_retrf;
data v_s3_fund_fh00_t_r3 (keep = aut_rank categ fundid_mer r3_fh1-r3_fh8);
    set v_s3_fund_fh00;
    if _TYPE_ = 'T' and _DEPVAR_ = 's3_retrf';
    array _src {8} fh1-fh8;
    array _dst {8} r3_fh1-r3_fh8;
    do _k = 1 to 8;
        _dst{_k} = _src{_k};
    end;
run;

*Combine the alpha and beta t-statistics per fund and build 0/1 significance flags;
data v_s3_fund_fh00_t_all (drop = _k);
    merge fund_fhalpha_t v_s3_fund_fh00_t_r0 v_s3_fund_fh00_t_r1 v_s3_fund_fh00_t_r3;
    by aut_rank categ fundid_mer;

    /* alphas: flagged only when the t-stat is above +1.65 (no absolute value).  */
    /* NOTE(review): betas below are two-sided - confirm one-sided is intended.  */
    /* A missing t-stat compares as false and therefore yields a flag of 0.      */
    r0_fh_a_sig10 = (r0_fh_a > 1.65);
    r1_fh_a_sig10 = (r1_fh_a > 1.65);
    r3_fh_a_sig10 = (r3_fh_a > 1.65);

    /* betas: flagged when the absolute t-stat is above 1.65 */
    array _t0 {8} r0_fh1-r0_fh8;
    array _t1 {8} r1_fh1-r1_fh8;
    array _t3 {8} r3_fh1-r3_fh8;
    array _s0 {8} r0_fh1_sig10 r0_fh2_sig10 r0_fh3_sig10 r0_fh4_sig10
                  r0_fh5_sig10 r0_fh6_sig10 r0_fh7_sig10 r0_fh8_sig10;
    array _s1 {8} r1_fh1_sig10 r1_fh2_sig10 r1_fh3_sig10 r1_fh4_sig10
                  r1_fh5_sig10 r1_fh6_sig10 r1_fh7_sig10 r1_fh8_sig10;
    array _s3 {8} r3_fh1_sig10 r3_fh2_sig10 r3_fh3_sig10 r3_fh4_sig10
                  r3_fh5_sig10 r3_fh6_sig10 r3_fh7_sig10 r3_fh8_sig10;

    do _k = 1 to 8;
        _s0{_k} = (abs(_t0{_k}) > 1.65);
        _s1{_k} = (abs(_t1{_k}) > 1.65);
        _s3{_k} = (abs(_t3{_k}) > 1.65);
    end;
run;

*Fraction of funds per category with a significant alpha or beta;
*(the mean of a 0/1 flag is the share of flagged funds);
proc sort data = v_s3_fund_fh00_t_all;
    by aut_rank categ fundid_mer;
run;

*The output drops the _sig10 suffix so the columns line up with the average and tstat rows;
proc means data = v_s3_fund_fh00_t_all noprint;
    by aut_rank categ;
    var r0_fh_a_sig10 r1_fh_a_sig10 r3_fh_a_sig10
        r0_fh1_sig10 r1_fh1_sig10 r3_fh1_sig10
        r0_fh2_sig10 r1_fh2_sig10 r3_fh2_sig10
        r0_fh3_sig10 r1_fh3_sig10 r3_fh3_sig10
        r0_fh4_sig10 r1_fh4_sig10 r3_fh4_sig10
        r0_fh5_sig10 r1_fh5_sig10 r3_fh5_sig10
        r0_fh6_sig10 r1_fh6_sig10 r3_fh6_sig10
        r0_fh7_sig10 r1_fh7_sig10 r3_fh7_sig10
        r0_fh8_sig10 r1_fh8_sig10 r3_fh8_sig10;
    output out = s3_fund_fh00_coef_all_sig10
        mean = r0_fh_a r1_fh_a r3_fh_a
               r0_fh1 r1_fh1 r3_fh1
               r0_fh2 r1_fh2 r3_fh2
               r0_fh3 r1_fh3 r3_fh3
               r0_fh4 r1_fh4 r3_fh4
               r0_fh5 r1_fh5 r3_fh5
               r0_fh6 r1_fh6 r3_fh6
               r0_fh7 r1_fh7 r3_fh7
               r0_fh8 r1_fh8 r3_fh8;
run;

data s3_fund_fh00_coef_all_sig10 (drop = _TYPE_ _FREQ_);
    set s3_fund_fh00_coef_all_sig10;
    stat = 'wsig10';
run;

*Put all together for HF_Betas table;
*These results are summarized in Figure 2;
*Stack the average, tstat and wsig10 rows, then order by category rank and statistic;
data table_hf_betas;
    set s3_fund_fh00_coef_all_av
        s3_fund_fh00_coef_all_t
        s3_fund_fh00_coef_all_sig10;
run;

proc sort data = table_hf_betas;
    by aut_rank stat;
run;


/*********************************************************************************************/
/***************************** R^2 TABLE OUTPUT *************************************/

*Regression R2 per fund, one extract per dependent variable (rows with _TYPE_ = PARMS);
data v_s3_fund_fh00_rsq_r0 (keep = aut_rank categ fundid_mer r0_rsq);
    set v_s3_fund_fh00;
    if _TYPE_ = 'PARMS' and _DEPVAR_ = 'retrf';
    r0_rsq = _RSQ_;
run;

data v_s3_fund_fh00_rsq_r1 (keep = aut_rank categ fundid_mer r1_rsq);
    set v_s3_fund_fh00;
    if _TYPE_ = 'PARMS' and _DEPVAR_ = 'glm_retrf';
    r1_rsq = _RSQ_;
run;

data v_s3_fund_fh00_rsq_r3 (keep = aut_rank categ fundid_mer r3_rsq);
    set v_s3_fund_fh00;
    if _TYPE_ = 'PARMS' and _DEPVAR_ = 's3_retrf';
    r3_rsq = _RSQ_;
run;

*Join the three R2 columns per fund and take pairwise differences;
data v_s3_fund_fh00_rsq_all;
    merge v_s3_fund_fh00_rsq_r0 v_s3_fund_fh00_rsq_r1 v_s3_fund_fh00_rsq_r3;
    by aut_rank categ fundid_mer;
    r1r0_rsq = r1_rsq - r0_rsq;
    r3r0_rsq = r3_rsq - r0_rsq;
    r3r1_rsq = r3_rsq - r1_rsq;
run;

*now do average and t-stat across funds, by category;
*average R2;
proc means data = v_s3_fund_fh00_rsq_all noprint;
    by aut_rank categ;
    var r0_rsq r1_rsq r3_rsq r1r0_rsq r3r0_rsq r3r1_rsq;
    output out = v_s3_fund_fh00_rsq_all_av mean = ;
run;

data v_s3_fund_fh00_rsq_all_av (drop = _TYPE_ _FREQ_);
    set v_s3_fund_fh00_rsq_all_av;
    stat = 'average';
run;

*t-stat for average R2;
proc means data = v_s3_fund_fh00_rsq_all noprint;
    by aut_rank categ;
    var r0_rsq r1_rsq r3_rsq r1r0_rsq r3r0_rsq r3r1_rsq;
    output out = v_s3_fund_fh00_rsq_all_t t = ;
run;

data v_s3_fund_fh00_rsq_all_t (drop = _TYPE_ _FREQ_);
    set v_s3_fund_fh00_rsq_all_t;
    stat = 'tstat';
run;

*set together;
data table_r2_part1;
    set v_s3_fund_fh00_rsq_all_av
        v_s3_fund_fh00_rsq_all_t;
run;

proc sort data = table_r2_part1;
    by aut_rank stat;
run;


*Decomposition of R2 (figure 6);
*Inputs needed per fund: covariance of each return series with each factor;
*and the variance of each return series;

proc sort data = s3_00_select;
    by fundid_mer;
run;

*Covariances of the fh factors (WITH) against the three return series (VAR), by fund;
proc corr data = s3_00_select outp = cov_ret_fh cov noprint;
    var retrf glm_retrf s3_retrf;
    with fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
    by fundid_mer;
run;

*organize data: keep only the covariance rows - xvar carries the factor name from _NAME_;
data cov_ret_fh_cov (keep = fundid_mer xvar retrf glm_retrf s3_retrf);
    set cov_ret_fh;
    if _TYPE_ = 'COV';
    xvar = _NAME_;
run;

*Reshape to one row per fund with columns cov_rX_ followed by the factor name;
proc transpose data = cov_ret_fh_cov
               out = cov_ret_fh_cov_r0 (drop = _NAME_)
               prefix = cov_r0_;
    var retrf;
    id xvar;
    by fundid_mer;
run;

proc transpose data = cov_ret_fh_cov
               out = cov_ret_fh_cov_r1 (drop = _NAME_)
               prefix = cov_r1_;
    var glm_retrf;
    id xvar;
    by fundid_mer;
run;

proc transpose data = cov_ret_fh_cov
               out = cov_ret_fh_cov_r3 (drop = _NAME_)
               prefix = cov_r3_;
    var s3_retrf;
    id xvar;
    by fundid_mer;
run;

*Variance of returns (Y), by fund;
proc sort data = s3_00_select;
    by fundid_mer;
run;

proc means data = s3_00_select noprint;
    by fundid_mer;
    var retrf glm_retrf s3_retrf;
    output out = var_ret (drop = _TYPE_ _FREQ_)
        var = var_r0 var_r1 var_r3;
run;

*Merge variances and covariances to betas (all keyed by fund);
proc sort data = v_s3_fund_fh00_coef_all; by fundid_mer; run;
proc sort data = var_ret;                 by fundid_mer; run;
proc sort data = cov_ret_fh_cov_r0;       by fundid_mer; run;
proc sort data = cov_ret_fh_cov_r1;       by fundid_mer; run;
proc sort data = cov_ret_fh_cov_r3;       by fundid_mer; run;

data v_table_r2_part2;
    merge v_s3_fund_fh00_coef_all
          var_ret
          cov_ret_fh_cov_r0
          cov_ret_fh_cov_r1
          cov_ret_fh_cov_r3;
    by fundid_mer;
run;

*Contribution of factor k to R2 = beta_k * cov(return, factor_k) / var(return);
*computed for each return version, followed by the pairwise differences;
data v_table_r2_part2 (drop = _k);
    set v_table_r2_part2;

    /* inputs: betas and covariances per return version */
    array _b0 {8} r0_fh1-r0_fh8;
    array _b1 {8} r1_fh1-r1_fh8;
    array _b3 {8} r3_fh1-r3_fh8;
    array _c0 {8} cov_r0_fh1-cov_r0_fh8;
    array _c1 {8} cov_r1_fh1-cov_r1_fh8;
    array _c3 {8} cov_r3_fh1-cov_r3_fh8;

    /* outputs: declared in the order the variables should appear */
    array _s0  {8} rsq_r0_fh1-rsq_r0_fh8;       /* ret      */
    array _s1  {8} rsq_r1_fh1-rsq_r1_fh8;       /* glm ret  */
    array _s3  {8} rsq_r3_fh1-rsq_r3_fh8;       /* s3 ret   */
    array _d10 {8} rsq_r1r0_fh1-rsq_r1r0_fh8;   /* r1 - r0  */
    array _d30 {8} rsq_r3r0_fh1-rsq_r3r0_fh8;   /* r3 - r0  */
    array _d31 {8} rsq_r3r1_fh1-rsq_r3r1_fh8;   /* r3 - r1  */

    do _k = 1 to 8;
        _s0{_k} = _b0{_k} * _c0{_k} / var_r0;
        _s1{_k} = _b1{_k} * _c1{_k} / var_r1;
        _s3{_k} = _b3{_k} * _c3{_k} / var_r3;

        _d10{_k} = _s1{_k} - _s0{_k};
        _d30{_k} = _s3{_k} - _s0{_k};
        _d31{_k} = _s3{_k} - _s1{_k};
    end;
run;

*Average and t-stat of the R2 contributions by category;
proc sort data = v_table_r2_part2;
    by aut_rank categ;
run;

*average - output names drop the rsq_ prefix;
proc means data = v_table_r2_part2 noprint;
    by aut_rank categ;
    var rsq_r0_fh1 rsq_r1_fh1 rsq_r3_fh1 rsq_r1r0_fh1 rsq_r3r0_fh1 rsq_r3r1_fh1
        rsq_r0_fh2 rsq_r1_fh2 rsq_r3_fh2 rsq_r1r0_fh2 rsq_r3r0_fh2 rsq_r3r1_fh2
        rsq_r0_fh3 rsq_r1_fh3 rsq_r3_fh3 rsq_r1r0_fh3 rsq_r3r0_fh3 rsq_r3r1_fh3
        rsq_r0_fh4 rsq_r1_fh4 rsq_r3_fh4 rsq_r1r0_fh4 rsq_r3r0_fh4 rsq_r3r1_fh4
        rsq_r0_fh5 rsq_r1_fh5 rsq_r3_fh5 rsq_r1r0_fh5 rsq_r3r0_fh5 rsq_r3r1_fh5
        rsq_r0_fh6 rsq_r1_fh6 rsq_r3_fh6 rsq_r1r0_fh6 rsq_r3r0_fh6 rsq_r3r1_fh6
        rsq_r0_fh7 rsq_r1_fh7 rsq_r3_fh7 rsq_r1r0_fh7 rsq_r3r0_fh7 rsq_r3r1_fh7
        rsq_r0_fh8 rsq_r1_fh8 rsq_r3_fh8 rsq_r1r0_fh8 rsq_r3r0_fh8 rsq_r3r1_fh8;
    output out = v_table_r2_part2_av
        mean = r0_fh1 r1_fh1 r3_fh1 r1r0_fh1 r3r0_fh1 r3r1_fh1
               r0_fh2 r1_fh2 r3_fh2 r1r0_fh2 r3r0_fh2 r3r1_fh2
               r0_fh3 r1_fh3 r3_fh3 r1r0_fh3 r3r0_fh3 r3r1_fh3
               r0_fh4 r1_fh4 r3_fh4 r1r0_fh4 r3r0_fh4 r3r1_fh4
               r0_fh5 r1_fh5 r3_fh5 r1r0_fh5 r3r0_fh5 r3r1_fh5
               r0_fh6 r1_fh6 r3_fh6 r1r0_fh6 r3r0_fh6 r3r1_fh6
               r0_fh7 r1_fh7 r3_fh7 r1r0_fh7 r3r0_fh7 r3r1_fh7
               r0_fh8 r1_fh8 r3_fh8 r1r0_fh8 r3r0_fh8 r3r1_fh8;
run;

data v_table_r2_part2_av (drop = _TYPE_ _FREQ_);
    set v_table_r2_part2_av;
    stat = 'average';
run;

*t-stat - same variables and output names as the average;
proc means data = v_table_r2_part2 noprint;
    by aut_rank categ;
    var rsq_r0_fh1 rsq_r1_fh1 rsq_r3_fh1 rsq_r1r0_fh1 rsq_r3r0_fh1 rsq_r3r1_fh1
        rsq_r0_fh2 rsq_r1_fh2 rsq_r3_fh2 rsq_r1r0_fh2 rsq_r3r0_fh2 rsq_r3r1_fh2
        rsq_r0_fh3 rsq_r1_fh3 rsq_r3_fh3 rsq_r1r0_fh3 rsq_r3r0_fh3 rsq_r3r1_fh3
        rsq_r0_fh4 rsq_r1_fh4 rsq_r3_fh4 rsq_r1r0_fh4 rsq_r3r0_fh4 rsq_r3r1_fh4
        rsq_r0_fh5 rsq_r1_fh5 rsq_r3_fh5 rsq_r1r0_fh5 rsq_r3r0_fh5 rsq_r3r1_fh5
        rsq_r0_fh6 rsq_r1_fh6 rsq_r3_fh6 rsq_r1r0_fh6 rsq_r3r0_fh6 rsq_r3r1_fh6
        rsq_r0_fh7 rsq_r1_fh7 rsq_r3_fh7 rsq_r1r0_fh7 rsq_r3r0_fh7 rsq_r3r1_fh7
        rsq_r0_fh8 rsq_r1_fh8 rsq_r3_fh8 rsq_r1r0_fh8 rsq_r3r0_fh8 rsq_r3r1_fh8;
    output out = v_table_r2_part2_t
        t = r0_fh1 r1_fh1 r3_fh1 r1r0_fh1 r3r0_fh1 r3r1_fh1
            r0_fh2 r1_fh2 r3_fh2 r1r0_fh2 r3r0_fh2 r3r1_fh2
            r0_fh3 r1_fh3 r3_fh3 r1r0_fh3 r3r0_fh3 r3r1_fh3
            r0_fh4 r1_fh4 r3_fh4 r1r0_fh4 r3r0_fh4 r3r1_fh4
            r0_fh5 r1_fh5 r3_fh5 r1r0_fh5 r3r0_fh5 r3r1_fh5
            r0_fh6 r1_fh6 r3_fh6 r1r0_fh6 r3r0_fh6 r3r1_fh6
            r0_fh7 r1_fh7 r3_fh7 r1r0_fh7 r3r0_fh7 r3r1_fh7
            r0_fh8 r1_fh8 r3_fh8 r1r0_fh8 r3r0_fh8 r3r1_fh8;
run;

data v_table_r2_part2_t (drop = _TYPE_ _FREQ_);
    set v_table_r2_part2_t;
    stat = 'tstat';
run;

*set together and then copy to output;
data table_r2_part2;
    set v_table_r2_part2_av
        v_table_r2_part2_t;
run;

proc sort data = table_r2_part2;
    by aut_rank stat;
run;

*set table R2 all together: total R2 columns (part 1) next to the factor contributions (part 2);
data table_r2_part12;
    merge table_r2_part1 table_r2_part2;
    by aut_rank stat;
run;


/**********************************************************************************/
/***************************   OVERALL STATISTICS   *******************************/
**************** AT FUND LEVEL - AVERAGE AT CATEGORY LEVEL;

***Per fund: time-series mean, t-stat and standard deviation of the three;
***return series and of the three alpha series;
proc sort data = v_rets_betas_gt12mm;
    by aut_rank categ fundid_mer;
run;

proc means data = v_rets_betas_gt12mm noprint;
    by aut_rank categ fundid_mer;
    var retrf r0_fh_a glm_retrf r1_fh_a s3_retrf r3_fh_a;
    output out = v_mean_rets_alphas
        mean = r0     r0_fh_a     r1     r1_fh_a     r3     r3_fh_a
        t    = r0_t   r0_fh_a_t   r1_t   r1_fh_a_t   r3_t   r3_fh_a_t
        std  = r0_std r0_fh_a_std r1_std r1_fh_a_std r3_std r3_fh_a_std;
run;

*Average of the fund-level statistics by category;
*AUTONAME appends the statistic name to every variable (for example r0_Mean);
proc means data = v_mean_rets_alphas noprint;
    by aut_rank categ;
    var r0     r0_fh_a     r1     r1_fh_a     r3     r3_fh_a
        r0_t   r0_fh_a_t   r1_t   r1_fh_a_t   r3_t   r3_fh_a_t
        r0_std r0_fh_a_std r1_std r1_fh_a_std r3_std r3_fh_a_std;
    output out = v_mean_rets_alphas_stats mean = / autoname;
run;

*Data from other tables;
*RSQ: category-average R2 from the R2 table;
data w_table_r2_part1_au (keep = aut_rank r0_rsq r1_rsq r3_rsq);
    set table_r2_part1;
    if stat = 'average';
run;

*Share of significant alphas from the betas table (wsig10 rows);
data w_table_hf_betas_au (keep = aut_rank r0_fh_a_sig10 r1_fh_a_sig10 r3_fh_a_sig10);
    set table_hf_betas;
    if stat = 'wsig10';
    r0_fh_a_sig10 = r0_fh_a;
    r1_fh_a_sig10 = r1_fh_a;
    r3_fh_a_sig10 = r3_fh_a;
run;

proc sort data = v_mean_rets_alphas_stats; by aut_rank; run;
proc sort data = w_table_r2_part1_au;      by aut_rank; run;
proc sort data = w_table_hf_betas_au;      by aut_rank; run;

*organize data for table: one row per category, eight columns per return version;
*  _avg average return, _std average volatility, _sharpe ratio of the two;
*  _rsq_av average R2, _fh_a_avg average alpha, _expl = 1 - alpha / return;
*  _fh_a_tstat average alpha t-stat, _fh_a_perc_sig10 share of significant alphas;
data tab_mean_rets_alphas_stats
        (keep = aut_rank categ
                r0_avg r0_std r0_sharpe r0_rsq_av r0_fh_a_avg r0_expl r0_fh_a_tstat r0_fh_a_perc_sig10
                r1_avg r1_std r1_sharpe r1_rsq_av r1_fh_a_avg r1_expl r1_fh_a_tstat r1_fh_a_perc_sig10
                r3_avg r3_std r3_sharpe r3_rsq_av r3_fh_a_avg r3_expl r3_fh_a_tstat r3_fh_a_perc_sig10);
    merge v_mean_rets_alphas_stats
          w_table_r2_part1_au
          w_table_hf_betas_au;
    by aut_rank;

    /* reported returns */
    r0_avg             = r0_mean;
    r0_std             = r0_std_mean;
    r0_sharpe          = r0_mean / r0_std_mean;
    r0_rsq_av          = r0_rsq;
    r0_fh_a_avg        = r0_fh_a_mean;
    r0_expl            = 1 - (r0_fh_a_mean / r0_mean);
    r0_fh_a_tstat      = r0_fh_a_t_mean;
    r0_fh_a_perc_sig10 = r0_fh_a_sig10;

    /* r1 series */
    r1_avg             = r1_mean;
    r1_std             = r1_std_mean;
    r1_sharpe          = r1_mean / r1_std_mean;
    r1_rsq_av          = r1_rsq;
    r1_fh_a_avg        = r1_fh_a_mean;
    r1_expl            = 1 - (r1_fh_a_mean / r1_mean);
    r1_fh_a_tstat      = r1_fh_a_t_mean;
    r1_fh_a_perc_sig10 = r1_fh_a_sig10;

    /* r3 series */
    r3_avg             = r3_mean;
    r3_std             = r3_std_mean;
    r3_sharpe          = r3_mean / r3_std_mean;
    r3_rsq_av          = r3_rsq;
    r3_fh_a_avg        = r3_fh_a_mean;
    r3_expl            = 1 - (r3_fh_a_mean / r3_mean);
    r3_fh_a_tstat      = r3_fh_a_t_mean;
    r3_fh_a_perc_sig10 = r3_fh_a_sig10;
run;


*********** Now again fund-level statistics, but group by strategy liquidity;

*make liquidity groups in file v_mean_rets_alphas;
*aut_rank 1-3 gives High, 4-8 gives Mid, above 8 gives Low (missing rank falls in High);
data v_mean_rets_alphas;
    set v_mean_rets_alphas;
    length strat_il_gr $4;
    if aut_rank > 8 then strat_il_gr = 'Low';
    else if aut_rank > 3 then strat_il_gr = 'Mid';
    else strat_il_gr = 'High';
run;

proc sort data = v_mean_rets_alphas;
    by strat_il_gr;
run;

*average of the fund-level statistics by liquidity group;
proc means data = v_mean_rets_alphas noprint;
    by strat_il_gr;
    var r0     r0_fh_a     r1     r1_fh_a     r3     r3_fh_a
        r0_t   r0_fh_a_t   r1_t   r1_fh_a_t   r3_t   r3_fh_a_t
        r0_std r0_fh_a_std r1_std r1_fh_a_std r3_std r3_fh_a_std;
    output out = v_mean_rets_alphas_liq_gr mean = / autoname;
run;

*Data from other tables;
*RSQ: average R2 by liquidity group, using the same grouping rule;
data v_s3_fund_fh00_rsq_all;
    set v_s3_fund_fh00_rsq_all;
    length strat_il_gr $4;
    if aut_rank > 8 then strat_il_gr = 'Low';
    else if aut_rank > 3 then strat_il_gr = 'Mid';
    else strat_il_gr = 'High';
run;

proc sort data = v_s3_fund_fh00_rsq_all;
    by strat_il_gr;
run;

proc means data = v_s3_fund_fh00_rsq_all noprint;
    by strat_il_gr;
    var r0_rsq r1_rsq r3_rsq r1r0_rsq r3r0_rsq r3r1_rsq;
    output out = v_s3_fund_fh00_rsq_liq_gr mean = ;
run;

proc sort data = v_mean_rets_alphas_liq_gr; by strat_il_gr; run;
proc sort data = v_s3_fund_fh00_rsq_liq_gr; by strat_il_gr; run;

*organize data for table: same layout as the category table;
*the share of significant alphas is not computed at this level and is filled with -99;
data tab_mean_rets_alphas_liq_gr
        (keep = strat_il_gr strat_liq_rank
                r0_avg r0_std r0_sharpe r0_rsq_av r0_fh_a_avg r0_expl r0_fh_a_tstat r0_fh_a_perc_sig10
                r1_avg r1_std r1_sharpe r1_rsq_av r1_fh_a_avg r1_expl r1_fh_a_tstat r1_fh_a_perc_sig10
                r3_avg r3_std r3_sharpe r3_rsq_av r3_fh_a_avg r3_expl r3_fh_a_tstat r3_fh_a_perc_sig10);
    merge v_mean_rets_alphas_liq_gr v_s3_fund_fh00_rsq_liq_gr;
    by strat_il_gr;

    /* reported returns */
    r0_avg             = r0_mean;
    r0_std             = r0_std_mean;
    r0_sharpe          = r0_mean / r0_std_mean;
    r0_rsq_av          = r0_rsq;
    r0_fh_a_avg        = r0_fh_a_mean;
    r0_expl            = 1 - (r0_fh_a_mean / r0_mean);
    r0_fh_a_tstat      = r0_fh_a_t_mean;
    r0_fh_a_perc_sig10 = -99;

    /* r1 series */
    r1_avg             = r1_mean;
    r1_std             = r1_std_mean;
    r1_sharpe          = r1_mean / r1_std_mean;
    r1_rsq_av          = r1_rsq;
    r1_fh_a_avg        = r1_fh_a_mean;
    r1_expl            = 1 - (r1_fh_a_mean / r1_mean);
    r1_fh_a_tstat      = r1_fh_a_t_mean;
    r1_fh_a_perc_sig10 = -99;

    /* r3 series */
    r3_avg             = r3_mean;
    r3_std             = r3_std_mean;
    r3_sharpe          = r3_mean / r3_std_mean;
    r3_rsq_av          = r3_rsq;
    r3_fh_a_avg        = r3_fh_a_mean;
    r3_expl            = 1 - (r3_fh_a_mean / r3_mean);
    r3_fh_a_tstat      = r3_fh_a_t_mean;
    r3_fh_a_perc_sig10 = -99;

    /* numeric sort key: High = 1, Mid = 2, anything else = 3 */
    if strat_il_gr = 'High' then strat_liq_rank = 1;
    else if strat_il_gr = 'Mid' then strat_liq_rank = 2;
    else strat_liq_rank = 3;
run;

proc sort data = tab_mean_rets_alphas_liq_gr;
    by strat_liq_rank;
run;


/***************************   OVERALL STATISTICS   *******************************/
********************************* AT STRATEGY LEVEL;

*Aggregate at strategy level: equal-weighted (simple mean across funds) return per;
*category and month for the three return versions, carrying the factors along;
proc sort data = v_rets_betas_gt12mm;
    by aut_rank categ yyyymm;
run;

proc means data = v_rets_betas_gt12mm noprint;
    by aut_rank categ yyyymm;
    var retrf glm_retrf s3_retrf fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
    output out = categ_ew_ret (drop = _TYPE_ _FREQ_)
        mean = r0_ew r1_ew r3_ew fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
run;

*Summary stats for category returns: time-series mean and std dev (AUTONAME suffixes);
proc means data = categ_ew_ret noprint;
    by aut_rank categ;
    var r0_ew r1_ew r3_ew;
    output out = categ_ew_ret_stats mean = std = / autoname;
run;

*Regression of each category return series on the eight fh factors, by category;
*TABLEOUT adds the rows with standard errors, t values and p-values to OUTEST;
proc sort data = categ_ew_ret;
    by aut_rank categ;
run;

proc reg data = categ_ew_ret outest = v_categ_ew_fhreg noprint tableout;
    model r0_ew = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
    model r1_ew = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
    model r3_ew = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
    by aut_rank categ;
run;
quit;

*Alpha, t-stat of alpha, p-value of alpha, and (adj) R^2;
*Each extract filters the regression output to one row type and one dependent variable;

*alpha (intercept on the PARMS row);
data v_categ_ew_fhreg_a_av_r0 (keep = aut_rank r0_a_mean);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r0_ew'));
    r0_a_mean = intercept;
run;

data v_categ_ew_fhreg_a_av_r1 (keep = aut_rank r1_a_mean);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r1_ew'));
    r1_a_mean = intercept;
run;

data v_categ_ew_fhreg_a_av_r3 (keep = aut_rank r3_a_mean);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r3_ew'));
    r3_a_mean = intercept;
run;

*alpha t-stat (intercept on the T row);
data v_categ_ew_fhreg_a_t_r0 (keep = aut_rank r0_a_t);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'T' and _DEPVAR_ = 'r0_ew'));
    r0_a_t = intercept;
run;

data v_categ_ew_fhreg_a_t_r1 (keep = aut_rank r1_a_t);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'T' and _DEPVAR_ = 'r1_ew'));
    r1_a_t = intercept;
run;

data v_categ_ew_fhreg_a_t_r3 (keep = aut_rank r3_a_t);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'T' and _DEPVAR_ = 'r3_ew'));
    r3_a_t = intercept;
run;

*alpha p-value (intercept on the PVALUE row);
data v_categ_ew_fhreg_a_p_r0 (keep = aut_rank r0_a_p);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PVALUE' and _DEPVAR_ = 'r0_ew'));
    r0_a_p = intercept;
run;

data v_categ_ew_fhreg_a_p_r1 (keep = aut_rank r1_a_p);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PVALUE' and _DEPVAR_ = 'r1_ew'));
    r1_a_p = intercept;
run;

data v_categ_ew_fhreg_a_p_r3 (keep = aut_rank r3_a_p);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PVALUE' and _DEPVAR_ = 'r3_ew'));
    r3_a_p = intercept;
run;

*regression adjusted RSQ (PARMS row);
data v_categ_ew_fhreg_rsq_r0 (keep = aut_rank r0_arsq);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r0_ew'));
    r0_arsq = _ADJRSQ_;
run;

data v_categ_ew_fhreg_rsq_r1 (keep = aut_rank r1_arsq);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r1_ew'));
    r1_arsq = _ADJRSQ_;
run;

data v_categ_ew_fhreg_rsq_r3 (keep = aut_rank r3_arsq);
    set v_categ_ew_fhreg (where = (_TYPE_ = 'PARMS' and _DEPVAR_ = 'r3_ew'));
    r3_arsq = _ADJRSQ_;
run;

*Set all together: return statistics plus the twelve regression extracts, keyed by aut_rank;
data tab_categ_ew (drop = _TYPE_ _FREQ_);
    merge categ_ew_ret_stats
          v_categ_ew_fhreg_a_av_r0 v_categ_ew_fhreg_a_av_r1 v_categ_ew_fhreg_a_av_r3
          v_categ_ew_fhreg_a_t_r0  v_categ_ew_fhreg_a_t_r1  v_categ_ew_fhreg_a_t_r3
          v_categ_ew_fhreg_a_p_r0  v_categ_ew_fhreg_a_p_r1  v_categ_ew_fhreg_a_p_r3
          v_categ_ew_fhreg_rsq_r0  v_categ_ew_fhreg_rsq_r1  v_categ_ew_fhreg_rsq_r3;
    by aut_rank;
run;

*organize data for table: strategy-level layout with eight columns per return version;
*  _avg mean return, _std std dev, _sharpe ratio of the two, _rsq adjusted R2;
*  _fh_a alpha, _expl = 1 - alpha / mean return, _fh_a_tstat and _fh_a_p for the alpha;
data tab_categ_ew
        (keep = aut_rank categ
                r0_avg r0_std r0_sharpe r0_rsq r0_fh_a r0_expl r0_fh_a_tstat r0_fh_a_p
                r1_avg r1_std r1_sharpe r1_rsq r1_fh_a r1_expl r1_fh_a_tstat r1_fh_a_p
                r3_avg r3_std r3_sharpe r3_rsq r3_fh_a r3_expl r3_fh_a_tstat r3_fh_a_p);
    set tab_categ_ew;

    /* reported returns */
    r0_avg        = r0_ew_mean;
    r0_std        = r0_ew_stddev;
    r0_sharpe     = r0_ew_mean / r0_ew_stddev;
    r0_rsq        = r0_arsq;
    r0_fh_a       = r0_a_mean;
    r0_expl       = 1 - (r0_a_mean / r0_ew_mean);
    r0_fh_a_tstat = r0_a_t;
    r0_fh_a_p     = r0_a_p;

    /* r1 series */
    r1_avg        = r1_ew_mean;
    r1_std        = r1_ew_stddev;
    r1_sharpe     = r1_ew_mean / r1_ew_stddev;
    r1_rsq        = r1_arsq;
    r1_fh_a       = r1_a_mean;
    r1_expl       = 1 - (r1_a_mean / r1_ew_mean);
    r1_fh_a_tstat = r1_a_t;
    r1_fh_a_p     = r1_a_p;

    /* r3 series */
    r3_avg        = r3_ew_mean;
    r3_std        = r3_ew_stddev;
    r3_sharpe     = r3_ew_mean / r3_ew_stddev;
    r3_rsq        = r3_arsq;
    r3_fh_a       = r3_a_mean;
    r3_expl       = 1 - (r3_a_mean / r3_ew_mean);
    r3_fh_a_tstat = r3_a_t;
    r3_fh_a_p     = r3_a_p;
run;







/****************************************************************************************************/
/****************************************************************************************************/
/****************************************************************************************************/
/****************************************************************************************************/
/****************************************************************************************************/
/****************************************************************************************************/

*** The following part of the code produces the results presented in Tables 2 and 3... ;
*** ...under the columns labeled "two group 3-step unsmoothing";
*** Robustness Test: Split into smaller category groups before running 3-step method;
****** Split each category into N groups;
****** Unsmooth funds in group n using index made up of funds in group n+1;
**Rename main dataset "hf04" for this part of the estimation;


%let ngr = 2; *how many groups;

*Working copy of the fund panel with a calendar-year column taken from yyyymm;
data hf04 (keep = ret yyyymm yyyy fundid_mer aut_rank categ
                  av_ret_fundid dem_ret fund_seq);
    set hf00;
    yyyy = round(yyyymm/100, 1);
run;

*Sort categories into N groups;
*First find the first year in which each fund appears;
proc sort data = hf04;
    by aut_rank fundid_mer;
run;

proc means data = hf04 noprint;
    by aut_rank fundid_mer;
    var yyyy;
    output out = v_hf04_minyyyy min = fund_min_yyyy;
run;

*Number the funds within each category and entry year, then deal them out;
*round-robin to groups 1..N. The new identifier is aut_rank*100 + group number;
proc sort data = v_hf04_minyyyy;
    by aut_rank fund_min_yyyy;
run;

data v_hf04_minyyyy (drop = _TYPE_);
    set v_hf04_minyyyy;
    by aut_rank fund_min_yyyy;

    if first.fund_min_yyyy then cat_yyyy_seq = 1;
    else cat_yyyy_seq + 1;

    /* remainder 0 means the fund belongs to the last group */
    group_n = mod(cat_yyyy_seq, &ngr);
    if group_n = 0 then group_n = &ngr;
    autrankn = aut_rank*100 + group_n;
run;

data v_new_group_list (keep = fundid_mer autrankn);
    set v_hf04_minyyyy;
run;

*add new group identifier to fund dataset;
proc sort data = hf04;             by fundid_mer; run;
proc sort data = v_new_group_list; by fundid_mer; run;

data hf04;
    merge hf04 v_new_group_list;
    by fundid_mer;
run;

*Now Apply 3-step Unsmoothing using Index;
*Equal-weighted return of each group and month (the group index);
proc sort data = hf04;
    by autrankn categ yyyymm;
run;

proc means data = hf04 noprint;
    by autrankn categ yyyymm;
    var ret;
    output out = hf04_ewret mean = ret_ew;
run;

*FIRST STEP: AGGREGATE ECONOMIC (UNSMOOTHED) RETURNS;
*_FREQ_ is the number of fund observations behind each group-month mean;
data h4_s1ag00 (keep = autrankn categ yyyymm ret_ew funds_categ_mm);
    set hf04_ewret;
    funds_categ_mm = _FREQ_;
run;

*demean: subtract the time-series average of the index within each group;
proc sort data = h4_s1ag00;
    by autrankn;
run;

proc means data = h4_s1ag00 noprint;
    by autrankn;
    var ret_ew funds_categ_mm;
    output out = w_h4_mean_ew_ret mean = av_aggrret_categ av_funds_categ_mm;
run;

data h4_s1ag00 (drop = _TYPE_ _FREQ_ funds_categ_mm av_funds_categ_mm);
    merge h4_s1ag00 w_h4_mean_ew_ret;
    by autrankn;
    dem_catret_ew = ret_ew - av_aggrret_categ;
run;

*category sequence #: running month counter within each group, starting at 1;
proc sort data = h4_s1ag00;
    by autrankn yyyymm;
run;

data h4_s1ag00;
    set h4_s1ag00;
    by autrankn;
    if first.autrankn then categ_seq = 1;
    else categ_seq + 1;
run;

*Apply MA unsmoothing to the demeaned group index;
*MA with 3 lags: no intercept, maximum likelihood, starting values of -0.2 for each lag;
*OUTEST holds the estimates and status per group, OUTSTAT the fit diagnostics;
proc sort data = h4_s1ag00;
    by autrankn yyyymm;
run;

proc arima data = h4_s1ag00;
    identify var = dem_catret_ew noprint;
    estimate q = 3 noint ma = -0.2 -0.2 -0.2 method = ml
             outest = h4_ag_arima_ma3_est
             outstat = h4_ag_arima_ma3_diag
             noprint;
    *forecast noprint;
    by autrankn;
run;
quit;

*Estimated PAIs for MA(3) case;
*Keep the EST row of the ARIMA output and turn the three MA coefficients into;
*smoothing weights pai_0..pai_3 that are rescaled to sum to one:;
*  pai_sum = 1 - ma1_1 - ma1_2 - ma1_3,  pai_0 = 1/pai_sum,  pai_k = -ma1_k/pai_sum;
data h4_ag_arima_ma3_est;
set h4_ag_arima_ma3_est;
if _TYPE_ = 'EST';
ma3_STATUS_ = _STATUS_;
ma3_pai_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_pai_0 = 1/ma3_pai_sum;
ma3_pai_1 = -ma1_1/ma3_pai_sum;
ma3_pai_2 = -ma1_2/ma3_pai_sum;
ma3_pai_3 = -ma1_3/ma3_pai_sum;
*sanity check: the normalized weights should add up to 1;
*(fixed: the lag-3 weight was missing from this sum - the variable is not kept in the output);
ma3_pai_sum_norm = ma3_pai_0 + ma3_pai_1 + ma3_pai_2 + ma3_pai_3;
keep autrankn ma3_STATUS_ ma3_pai_0 ma3_pai_1 ma3_pai_2 ma3_pai_3 ma3_pai_sum;
run;

*add coefficients to main dataset (one set of weights per group);
proc sort data = h4_ag_arima_ma3_est; by autrankn; run;
proc sort data = h4_s1ag00;           by autrankn; run;

data h4_s1ag01;
    merge h4_s1ag00 h4_ag_arima_ma3_est;
    by autrankn;
run;

*non-convergence and implausible estimates;
*Fall back to no smoothing (weights 1,0,0,0) when any weight is above 1.25 or;
*at or below -0.45, or when the ARIMA status is anything other than 0 Converged;
*A missing weight compares as smaller than -0.45 and therefore also triggers the fallback;
data h4_s1ag01;
    set h4_s1ag01;

    ma3_gt_1_5 = (   ma3_pai_0 > 1.25   or ma3_pai_1 > 1.25
                  or ma3_pai_2 > 1.25   or ma3_pai_3 > 1.25
                  or ma3_pai_0 <= -0.45 or ma3_pai_1 <= -0.45
                  or ma3_pai_2 <= -0.45 or ma3_pai_3 <= -0.45);
    ma3_status = (ma3_STATUS_ ne "0 Converged");

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_pai_0 = 1;
        sel_pai_1 = 0;
        sel_pai_2 = 0;
        sel_pai_3 = 0;
    end;
    else do;
        sel_pai_0 = ma3_pai_0;
        sel_pai_1 = ma3_pai_1;
        sel_pai_2 = ma3_pai_2;
        sel_pai_3 = ma3_pai_3;
    end;
run;


*back out unsmoothed returns for aggregate returns;

proc sort data = h4_s1ag01; by autrankn yyyymm; run;

/* Macro back_out_catrets
   Inverts the MA(3) smoothing of the demeaned group index, period by period:
     backed(t) = ( observed(t) - pai_1*backed(t-1) - pai_2*backed(t-2) - pai_3*backed(t-3) ) / pai_0
   The first three periods of every group keep the observed (demeaned) value.
   Reads : h4_s1ag01       (needs categ_seq, dem_catret_ew, sel_pai_0-sel_pai_3)
   Writes: h4_s1ag01_loop  (adds backed_catret_aic and its three lags)
   One DATA step pass is made per period because the value at period i depends on
   the values already backed out at periods i-1, i-2 and i-3.
   Change vs. the previous version: the last period is taken from the data
   (largest categ_seq) instead of the hard-coded 276, so series longer than 276
   periods are no longer left partly smoothed. Results are unchanged when no
   series exceeds 276 periods. */
%macro back_out_catrets;
%local i maxseq;
%let maxseq = 0;

data h4_s1ag01_loop;
set h4_s1ag01;
backed_catret_aic = dem_catret_ew;
keep autrankn categ categ_seq backed_catret_aic dem_catret_ew yyyymm av_aggrret_categ
sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3;
run;

proc sort data = h4_s1ag01_loop; by autrankn yyyymm; run;

/* longest series across all groups; max(0, .) guards against an empty input */
proc sql noprint;
select max(categ_seq) into :maxseq from h4_s1ag01_loop;
quit;
%let maxseq = %sysfunc(max(0, &maxseq));

%do i = 4 %to &maxseq; /* MA3: three lags are needed, so start at period 4 */

data h4_s1ag01_loop;
set h4_s1ag01_loop;
lag1_backed_catret_aic = lag1(backed_catret_aic);
lag2_backed_catret_aic = lag2(backed_catret_aic);
lag3_backed_catret_aic = lag3(backed_catret_aic);

if categ_seq = &i then backed_catret_aic = (dem_catret_ew - sel_pai_1*lag1_backed_catret_aic - sel_pai_2*lag2_backed_catret_aic - sel_pai_3*lag3_backed_catret_aic)/sel_pai_0;
run;

%end;

%mend back_out_catrets;
%back_out_catrets;


*Group mean of the backed-out series (AUTONAME gives backed_catret_aic_Mean);
proc sort data = h4_s1ag01_loop;
    by autrankn;
run;

proc means data = h4_s1ag01_loop noprint;
    by autrankn;
    var backed_catret_aic;
    output out = check_h4_av_s1ag01_loop mean = / autoname;
run;

*re-adjust mean: park the series under a temporary name, then subtract its group mean;
data h4_s1ag01_loop (drop = backed_catret_aic);
    merge h4_s1ag01_loop check_h4_av_s1ag01_loop;
    by autrankn;
    temp_backed_catret_aic = backed_catret_aic;
run;

data h4_s1ag01_loop (drop = backed_catret_aic_mean temp_backed_catret_aic _FREQ_ _TYPE_);
    set h4_s1ag01_loop;
    backed_catret_aic = temp_backed_catret_aic - backed_catret_aic_mean;
run;

*check dataset: group means after re-centring;
proc sort data = h4_s1ag01_loop;
    by autrankn;
run;

proc means data = h4_s1ag01_loop noprint;
    by autrankn;
    var backed_catret_aic;
    output out = check_h4_av_s1ag01_loop_v2 mean = / autoname;
run;

*Clean dataset and add back mean;
*dem_backed_catret_aic is the demeaned version, backed_catret_aic has the original index mean restored;
data h4_s1ag01_loop (keep = autrankn yyyymm backed_catret_aic dem_backed_catret_aic);
    set h4_s1ag01_loop;
    dem_backed_catret_aic = backed_catret_aic;
    backed_catret_aic = backed_catret_aic + av_aggrret_categ;
run;

*Attach the unsmoothed index to the group-level file;
proc sort data = h4_s1ag01_loop; by autrankn yyyymm; run;
proc sort data = h4_s1ag01;      by autrankn yyyymm; run;

data h4_s1ag02;
    merge h4_s1ag01 h4_s1ag01_loop;
    by autrankn yyyymm;
run;

************************************************************************;
* SECOND STEP: OBTAIN FUND-LEVEL UNSMOOTHED EXCESS RETURNS (EXCESS WRT EW CATEGORY);

*first get fund-level data;
*Only the identifiers, the return and the fund mean return are carried over from hf04;
data h4_s2fund00;
    set hf04;
    keep ret yyyymm fundid_mer categ fund_seq av_ret_fundid autrankn;
run;

*KEY CHANGE: for the "multiple group" robustness test, rename index n as index n+1, etc;
*The key autrankn is treated as aut_rank*100 + n, matching how new_autrankn is rebuilt below;
data w_h4_au_ewcatert;
    set h4_s1ag02;
    catret_ew = ret_ew;
    *self-assignment leaves the values unchanged and is retained from the original step;
    ag_aic_adj1_win = ag_aic_adj1_win;
    length index_n 8;
    orig_autrankn = autrankn;
    *split the key arithmetically, which avoids implicit numeric-to-character conversion;
    *and keeps working when the group index has more than one digit;
    index_n       = mod(autrankn, 100);
    orig_aut_rank = (autrankn - index_n)/100;
    new_autrankn  = orig_aut_rank*100 + index_n + 1;
    if index_n = &ngr then new_autrankn = orig_aut_rank*100 + 1; *if n = N, then unsmooth with respect to n = 1;
    drop autrankn;
run;

*clean;
*The shifted key takes over the name autrankn and the original key stays in orig_autrankn;
data w_h4_au_ewcatert;
    set w_h4_au_ewcatert;
    autrankn = new_autrankn;
    keep autrankn yyyymm catret_ew backed_catret_aic ag_aic_adj1_win orig_autrankn;
run;

*Lags 1 to 3 of the backed-out aggregate return, blanked when the lagged row has another autrankn;
proc sort data = w_h4_au_ewcatert; by autrankn yyyymm; run;
data w_h4_au_ewcatert;
    set w_h4_au_ewcatert;
    *1 lag;
    lag1_backed_catret_aic = lag1(backed_catret_aic);
    if autrankn ne lag1(autrankn) then lag1_backed_catret_aic = . ;
    *2 lag;
    lag2_backed_catret_aic = lag2(backed_catret_aic);
    if autrankn ne lag2(autrankn) then lag2_backed_catret_aic = . ;
    *3 lag;
    lag3_backed_catret_aic = lag3(backed_catret_aic);
    if autrankn ne lag3(autrankn) then lag3_backed_catret_aic = . ;
run;

*attach and demean;
*Join the shifted aggregate series to the fund panel and form the return in excess of catret_ew;

proc sort data = h4_s2fund00;
    by autrankn yyyymm;
run;

proc sort data = w_h4_au_ewcatert;
    by autrankn yyyymm;
run;

data h4_s2fund00;
    merge h4_s2fund00
          w_h4_au_ewcatert;
    by autrankn yyyymm;
    ret_excat = ret - catret_ew;
run;

*Obtain fund average return in excess of the category return;
*demean unsmoothed EW categ return;
*One output row per fundid_mer with the mean of each listed variable;

proc sort data = h4_s2fund00;
    by fundid_mer;
run;

proc summary data = h4_s2fund00;
    by fundid_mer;
    var ret_excat
        backed_catret_aic
        lag1_backed_catret_aic
        lag2_backed_catret_aic
        lag3_backed_catret_aic;
    output out = w_h4_mean_excat_ret
        mean = av_retexcat_fundid
               av_backed_catret_aic_fund
               av_lag1_backed_catret_aic_fund
               av_lag2_backed_catret_aic_fund
               av_lag3_backed_catret_aic_fund;
run;

*Subtract the fund-level means from the excess return and from each aggregate regressor;
data h4_s2fund00;
    merge h4_s2fund00
          w_h4_mean_excat_ret;
    by fundid_mer;
    dem_ret_excat              = ret_excat              - av_retexcat_fundid;
    dem_backed_catret_aic      = backed_catret_aic      - av_backed_catret_aic_fund;
    dem_lag1_backed_catret_aic = lag1_backed_catret_aic - av_lag1_backed_catret_aic_fund;
    dem_lag2_backed_catret_aic = lag2_backed_catret_aic - av_lag2_backed_catret_aic_fund;
    dem_lag3_backed_catret_aic = lag3_backed_catret_aic - av_lag3_backed_catret_aic_fund;
run;


*MA with 3 lags;
*Fund-by-fund ML fit of the demeaned excess return on the four demeaned aggregate regressors;
*with MA(3) errors, no intercept, and starting MA values of -0.2 for each lag;
*Estimates are written to h4_excat_arima_ma3_est and fit statistics to h4_excat_arima_ma3_diag;
proc sort data = h4_s2fund00;
    by fundid_mer yyyymm;
run;

proc arima data = h4_s2fund00;
    identify var = dem_ret_excat
             crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic)
             noprint;
    estimate q = 3
             input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic)
             noint
             ma = -0.2 -0.2 -0.2
             method = ml
             OUTEST = h4_excat_arima_ma3_est
             OUTSTAT = h4_excat_arima_ma3_diag
             noprint;
    *forecast noprint;
    by fundid_mer;
run;
quit;

*MA3;
*Keep the EST row of each fund and rescale the MA estimates into weights;
*phi_0 = 1/S and phi_k = -ma1_k/S, where S = 1 - ma1_1 - ma1_2 - ma1_3;
data h4_excat_arima_ma3_est;
    set h4_excat_arima_ma3_est;
    if _TYPE_ = 'EST';
    ma3_STATUS_ = _STATUS_;
    ma3_phi_sum = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_phi_0 = 1/ma3_phi_sum;
    ma3_phi_1 = -ma1_1/ma3_phi_sum;
    ma3_phi_2 = -ma1_2/ma3_phi_sum;
    ma3_phi_3 = -ma1_3/ma3_phi_sum;
    *sum of all four rescaled weights, which now includes the third lag;
    *this check value is not in the keep list below, so it is not written to the output dataset;
    ma3_phi_sum_norm = ma3_phi_0 + ma3_phi_1 + ma3_phi_2 + ma3_phi_3;
    keep fundid_mer ma3_STATUS_ ma3_phi_0 ma3_phi_1 ma3_phi_2 ma3_phi_3 ma3_phi_sum;
run;

*attach thetas to main dataset;
proc sort data = h4_s2fund00;
    by fundid_mer;
run;

proc sort data = h4_excat_arima_ma3_est;
    by fundid_mer;
run;

data h4_s2fund01;
    merge h4_s2fund00
          h4_excat_arima_ma3_est;
    by fundid_mer;
run;

*Choose the weights used for unsmoothing;
*The estimated weights are used unless a weight is above 1.25, a weight is at or below -0.45,;
*or the ARIMA status text differs from 0 Converged, in which case the weights are set to 1 0 0 0;
*A missing weight also satisfies the le -0.45 test because SAS orders missing below every number;
data h4_s2fund01;
    set h4_s2fund01;

    ma3_gt_1_5 = 0;
    if ma3_phi_0 gt 1.25 or ma3_phi_1 gt 1.25 or ma3_phi_2 gt 1.25 or ma3_phi_3 gt 1.25 then ma3_gt_1_5 = 1;
    if ma3_phi_0 le -0.45 or ma3_phi_1 le -0.45 or ma3_phi_2 le -0.45 or ma3_phi_3 le -0.45 then ma3_gt_1_5 = 1;

    ma3_status = 0;
    if ma3_STATUS_ ne "0 Converged" then ma3_status = 1;

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_phi_0 = 1;
        sel_phi_1 = 0;
        sel_phi_2 = 0;
        sel_phi_3 = 0;
    end;
    else do;
        sel_phi_0 = ma3_phi_0;
        sel_phi_1 = ma3_phi_1;
        sel_phi_2 = ma3_phi_2;
        sel_phi_3 = ma3_phi_3;
    end;
run;


*back out unsmoothed excess returns;
*Recursion applied to rows with first_seq le fund_seq le last_seq, in fundid_mer yyyymm order;
*backed(t) = (dem_ret_excat(t) - sel_phi_1*backed(t-1) - sel_phi_2*backed(t-2) - sel_phi_3*backed(t-3)) / sel_phi_0;
*Every other row keeps dem_ret_excat as its backed-out value;
proc sort data = h4_s2fund01; by fundid_mer yyyymm; run;

%macro back_out_exrets(first_seq=4, last_seq=276);

*Starting values: the demeaned excess return;
data h4_s2fund01_loop;
    set h4_s2fund01;
    backed_ret_excat_aic = dem_ret_excat;
    keep backed_ret_excat_aic dem_ret_excat fundid_mer yyyymm fund_seq av_ret_fundid
         sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3 av_retexcat_fundid;
run;

proc sort data = h4_s2fund01_loop; by fundid_mer yyyymm; run;

*Single pass: the three previous backed-out values are carried forward in retained work variables;
*This replaces one full rewrite of the fund panel per sequence number with one read of the panel;
*The work variables are not reset between funds, as the LAG calls of the looped version were not;
data h4_s2fund01_loop;
    set h4_s2fund01_loop;
    retain w_prev1_backed w_prev2_backed w_prev3_backed;

    lag1_backed_ret_excat_aic = w_prev1_backed;
    lag2_backed_ret_excat_aic = w_prev2_backed;
    lag3_backed_ret_excat_aic = w_prev3_backed;

    *a missing fund_seq fails the range test, so such rows are left untouched;
    if &first_seq le fund_seq and fund_seq le &last_seq then
        backed_ret_excat_aic = (dem_ret_excat
                                - sel_phi_1*lag1_backed_ret_excat_aic
                                - sel_phi_2*lag2_backed_ret_excat_aic
                                - sel_phi_3*lag3_backed_ret_excat_aic)/sel_phi_0;

    *shift the history only after the current row has its final value;
    w_prev3_backed = w_prev2_backed;
    w_prev2_backed = w_prev1_backed;
    w_prev1_backed = backed_ret_excat_aic;

    drop w_prev1_backed w_prev2_backed w_prev3_backed;
run;

%mend back_out_exrets;
%back_out_exrets;

*clean;
*Reduce the backed-out panel to its keys plus the result, stored as res_backed_ret_excat_aic;
data h4_s2fund01_loop;
    set h4_s2fund01_loop;
    res_backed_ret_excat_aic = backed_ret_excat_aic;
    keep fundid_mer yyyymm res_backed_ret_excat_aic;
run;

*Attach the result to the fund-level dataset;
proc sort data = h4_s2fund01_loop;
    by fundid_mer yyyymm;
run;

proc sort data = h4_s2fund01;
    by fundid_mer yyyymm;
run;

data h4_s2fund02;
    merge h4_s2fund01
          h4_s2fund01_loop;
    by fundid_mer yyyymm;
run;

* STEP 3 : Add up unsmoothed residual from step 1 and step 2;

*aggregate residual;
data h4_s1_resid;
    set h4_s1ag02;
    keep autrankn categ yyyymm av_aggrret_categ
         backed_catret_aic dem_backed_catret_aic;
run;

*fund-level cat-excess return residual;
data h4_s2_resid;
    set h4_s2fund02;
    keep fundid_mer yyyymm dem_ret_excat res_backed_ret_excat_aic;
run;

*attach to main dataset;
*First the aggregate piece by autrankn and month, then the fund piece by fund and month;
proc sort data = hf04;
    by autrankn yyyymm;
run;

proc sort data = h4_s1_resid;
    by autrankn yyyymm;
run;

data h4_s3_00;
    merge hf04
          h4_s1_resid;
    by autrankn yyyymm;
run;

proc sort data = h4_s3_00;
    by fundid_mer yyyymm;
run;

proc sort data = h4_s2_resid;
    by fundid_mer yyyymm;
run;

data h4_s3_00;
    merge h4_s3_00
          h4_s2_resid;
    by fundid_mer yyyymm;
run;

*Provisional unsmoothed return: demeaned aggregate piece + fund excess piece + fund mean return;
data h4_s3_00;
    set h4_s3_00;
    s3_uns_ret_aic_temp = dem_backed_catret_aic + res_backed_ret_excat_aic + av_ret_fundid;
run;

*adjust mean;
*Shift the provisional series so its fund-level mean equals the fund-level mean of av_ret_fundid;
proc sort data = h4_s3_00;
    by fundid_mer;
run;

proc summary data = h4_s3_00;
    by fundid_mer;
    var s3_uns_ret_aic_temp av_ret_fundid;
    output out = w_h4_adj_s3_uns mean = / autoname;
run;

data h4_s3_00;
    merge h4_s3_00
          w_h4_adj_s3_uns;
    by fundid_mer;
    s3_uns_ret_aic = s3_uns_ret_aic_temp - s3_uns_ret_aic_temp_mean + av_ret_fundid_mean;
    drop s3_uns_ret_aic_temp_mean av_ret_fundid_mean _TYPE_ _FREQ_;
run;

***************************** PROCEED TO ANALYSIS OF UNSMOOTHED DATA;

*Label each row by aut_rank: High up to 3, Mid from 4 to 8, Low above 8;
*The High assignment comes first so the character length of strat_il_gr is 4;
data h4_s3_00a;
    set h4_s3_00;

    strat_il_gr = 'High';
    if aut_rank gt 3 then strat_il_gr = 'Mid';
    if aut_rank gt 8 then strat_il_gr = 'Low';

    s3_ret = s3_uns_ret_aic; *choose s3_uns_ret_aic or s3_uns_ret_fix;

    keep fundid_mer yyyymm categ aut_rank autrankn
         strat_il_gr fund_seq s3_ret;
run;

*attach imputed add date;
*Both sources are keyed on fundid_mer: the TASS id times 100 plus 1 and the BH id times 100 plus 2;
data tass_imputed_add_date1;
    set hfautoc.tass_imputed_add_date;
    fundid_mer = fundid*100 + 1;
    keep fundid_mer imputed_add_date;
run;

data bh_imputed_add_date1;
    set hfautoc.bh_imputed_add_date;
    fundid_mer = fund_id*100 + 2;
    keep fundid_mer imputed_add_date;
run;

*Stack the two sources;
data imputed_add_dates;
    set tass_imputed_add_date1
        bh_imputed_add_date1;
run;

proc sort data = h4_s3_00a;
    by fundid_mer;
run;

proc sort data = imputed_add_dates;
    by fundid_mer;
run;

*Keep only funds that are present in h4_s3_00a;
data h4_s3_00a;
    merge h4_s3_00a(in=a)
          imputed_add_dates;
    by fundid_mer;
    if a;
run;

*Read HF factors;
*The eight factor columns are copied to fh1-fh8 and rf is kept alongside them;
data hffact;
    set hfautoc.hf_factors;
    fh1 = sp500_rf;
    fh2 = size_spread;
    fh3 = emerg_mkt_rf;
    fh4 = FS_bond_mkt;
    fh5 = FS_credit_sprd;
    fh6 = PTFSBD;
    fh7 = PTFSFX;
    fh8 = PTFSCOM;
    keep yyyymm fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 rf;
run;

*merge;
*Monthly factors are attached to every fund row of the same yyyymm;
proc sort data = h4_s3_00a;
    by yyyymm;
run;

proc sort data = hffact;
    by yyyymm;
run;

data h4_s3_00a;
    merge h4_s3_00a(in=a)
          hffact;
    by yyyymm;
    if a;
run;

*Unsmoothed return in excess of rf;
data h4_s3_00a;
    set h4_s3_00a;
    s3_retrf = s3_ret - rf;
run;

***************** Autocorrelation Table;
*up to 4 lags;

data h4_lagret_fund;
    set h4_s3_00a;
    keep yyyymm fundid_mer categ fund_seq aut_rank autrankn strat_il_gr s3_ret;
run;

****************************;
**** S3 RET;

%let f_rets = s3_ret;

*Copy the selected return column into the generic name rets;
data h4_lagret_fund_select_temp;
    set h4_lagret_fund;
    rets = &f_rets;
run;

proc sort data = h4_lagret_fund_select_temp;
    by fundid_mer yyyymm;
run;

*lag ret;
*Lag k is blanked when fund_seq is at most k;
data h4_lagret_fund_select_temp;
    set h4_lagret_fund_select_temp;

    lag1_rets = lag1(rets);
    lag2_rets = lag2(rets);
    lag3_rets = lag3(rets);
    lag4_rets = lag4(rets);

    if fund_seq le 1 then lag1_rets = . ;
    if fund_seq le 2 then lag2_rets = . ;
    if fund_seq le 3 then lag3_rets = . ;
    if fund_seq le 4 then lag4_rets = . ;
run;

*Regression sample: rows with fund_seq above 4;
data h4_lagret_fund_select;
    set h4_lagret_fund_select_temp;
    if fund_seq gt 4;
run;

*regressions;
*Five models per fund: one with all four lags, then one for each lag on its own;
proc sort data = h4_lagret_fund_select;
    by aut_rank categ autrankn fundid_mer yyyymm;
run;

proc reg data = h4_lagret_fund_select outest = v_h4_lagret_fund_reg noprint tableout;
    *multivariate;
    model rets = lag1_rets lag2_rets lag3_rets lag4_rets / edf ADJRSQ;
    *univariate;
    model rets = lag1_rets / edf ADJRSQ;
    model rets = lag2_rets / edf ADJRSQ;
    model rets = lag3_rets / edf ADJRSQ;
    model rets = lag4_rets / edf ADJRSQ;
    by aut_rank categ autrankn fundid_mer;
run;
quit;

*Keep the coefficient and t-statistic rows and number the models from the _MODEL_ label;
*Each t_lagK_rets_ge_165 flag is 1 only on a T row whose value for that lag is at least 1.65;
data v_h4_lagret_fund_reg;
    set v_h4_lagret_fund_reg;
    if _TYPE_ in ('PARMS', 'T');

    model_n = 100;
    model_n = substr(_MODEL_,6,2)*1;

    t_lag1_rets_ge_165 = (_TYPE_ = 'T' and lag1_rets ge 1.65);
    t_lag2_rets_ge_165 = (_TYPE_ = 'T' and lag2_rets ge 1.65);
    t_lag3_rets_ge_165 = (_TYPE_ = 'T' and lag3_rets ge 1.65);
    t_lag4_rets_ge_165 = (_TYPE_ = 'T' and lag4_rets ge 1.65);

    statis = _TYPE_;
run;

*average coefficients and T-stat;
*Part 1: Average within each group;

proc sort data = v_h4_lagret_fund_reg;
    by model_n aut_rank categ autrankn statis;
run;

proc summary data = v_h4_lagret_fund_reg;
    by model_n aut_rank categ autrankn statis;
    var lag1_rets lag2_rets lag3_rets lag4_rets
        t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
    output out = v_h4_lagret_fund_reg_avg
        mean = lag1_rets lag2_rets lag3_rets lag4_rets
               t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

*Turn each T row into a t_p10 row whose lag columns hold the mean of the ge 1.65 flags;
data wv_h4_lagret_fund_reg_avg1;
    set v_h4_lagret_fund_reg_avg(drop = lag1_rets lag2_rets lag3_rets lag4_rets);
    if statis = 'T';
    statis = 't_p10';
    lag1_rets = t_lag1_rets_ge_165;
    lag2_rets = t_lag2_rets_ge_165;
    lag3_rets = t_lag3_rets_ge_165;
    lag4_rets = t_lag4_rets_ge_165;
run;

*attach to main dataset, then organize;

data w_h4_lagret_fund_reg_avg_org;
    set v_h4_lagret_fund_reg_avg
        wv_h4_lagret_fund_reg_avg1;
    drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

proc sort data = w_h4_lagret_fund_reg_avg_org;
    by model_n aut_rank categ autrankn statis;
run;

*Model 1 is the multivariate regression;
data w_lagret_fund_reg_avg_org_m1h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 1;
    type = 'multivar';
run;

*Models 2 to 5 are the univariate regressions and each contributes its own lag column;
data w_lagret_fund_reg_avg_org_m2h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 2;
    drop lag2_rets lag3_rets lag4_rets;
run;

data w_lagret_fund_reg_avg_org_m3h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 3;
    keep lag2_rets;
run;

data w_lagret_fund_reg_avg_org_m4h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 4;
    keep lag3_rets;
run;

data w_lagret_fund_reg_avg_org_m5h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 5;
    keep lag4_rets;
run;

*Merge without a BY statement: rows are paired by position across the four datasets;
data w_lagret_fund_reg_avg_m2345h4;
    merge w_lagret_fund_reg_avg_org_m2h4
          w_lagret_fund_reg_avg_org_m3h4
          w_lagret_fund_reg_avg_org_m4h4
          w_lagret_fund_reg_avg_org_m5h4;
    type = 'univar';
run;


*Final table keeps the univariate rows and removes the T rows;
data h4_autoc_fund_r3_separate_grn&ngr;
    set w_lagret_fund_reg_avg_org_m1h4
        w_lagret_fund_reg_avg_m2345h4;
    if type = 'univar';
    if statis = "T" then delete;
run;

proc sort data = h4_autoc_fund_r3_separate_grn&ngr;
    by model_n aut_rank categ autrankn statis;
run;


*Part 2: Average across each group;
*Same pipeline as Part 1 with autrankn removed from the grouping variables;
proc sort data = v_h4_lagret_fund_reg;
    by model_n aut_rank categ statis;
run;

proc summary data = v_h4_lagret_fund_reg;
    by model_n aut_rank categ statis;
    var lag1_rets lag2_rets lag3_rets lag4_rets
        t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
    output out = v_h4_lagret_fund_reg_avg
        mean = lag1_rets lag2_rets lag3_rets lag4_rets
               t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

*Turn each T row into a t_p10 row whose lag columns hold the mean of the ge 1.65 flags;
data wv_h4_lagret_fund_reg_avg1;
    set v_h4_lagret_fund_reg_avg(drop = lag1_rets lag2_rets lag3_rets lag4_rets);
    if statis = 'T';
    statis = 't_p10';
    lag1_rets = t_lag1_rets_ge_165;
    lag2_rets = t_lag2_rets_ge_165;
    lag3_rets = t_lag3_rets_ge_165;
    lag4_rets = t_lag4_rets_ge_165;
run;

*attach to main dataset, then organize;

data w_h4_lagret_fund_reg_avg_org;
    set v_h4_lagret_fund_reg_avg
        wv_h4_lagret_fund_reg_avg1;
    drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

proc sort data = w_h4_lagret_fund_reg_avg_org;
    by model_n aut_rank categ statis;
run;

*Model 1 is the multivariate regression;
data w_lagret_fund_reg_avg_org_m1h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 1;
    type = 'multivar';
run;

*Models 2 to 5 are the univariate regressions and each contributes its own lag column;
data w_lagret_fund_reg_avg_org_m2h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 2;
    drop lag2_rets lag3_rets lag4_rets;
run;

data w_lagret_fund_reg_avg_org_m3h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 3;
    keep lag2_rets;
run;

data w_lagret_fund_reg_avg_org_m4h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 4;
    keep lag3_rets;
run;

data w_lagret_fund_reg_avg_org_m5h4;
    set w_h4_lagret_fund_reg_avg_org;
    if model_n = 5;
    keep lag4_rets;
run;

*Merge without a BY statement: rows are paired by position across the four datasets;
data w_lagret_fund_reg_avg_m2345h4;
    merge w_lagret_fund_reg_avg_org_m2h4
          w_lagret_fund_reg_avg_org_m3h4
          w_lagret_fund_reg_avg_org_m4h4
          w_lagret_fund_reg_avg_org_m5h4;
    type = 'univar';
run;

*Final table keeps the univariate rows and removes the T rows;
data h4_autoc_fund_r3_together_grn&ngr;
    set w_lagret_fund_reg_avg_org_m1h4
        w_lagret_fund_reg_avg_m2345h4;
    if type = 'univar';
    if statis = "T" then delete;
run;

proc sort data = h4_autoc_fund_r3_together_grn&ngr;
    by model_n aut_rank categ statis;
run;

******************************;
*Aggregate Autocorrelation;
*Equal-weighted mean of rets per aut_rank autrankn categ and month, then lags 1 to 4 of that series;

proc sort data = h4_lagret_fund_select_temp; by aut_rank autrankn categ yyyymm; run;
proc summary data = h4_lagret_fund_select_temp;
    var rets;
    output out = h4_lagret_fund_select_ew mean = rets_ew;
    by aut_rank autrankn categ yyyymm;
run;

*lag ret;
*Lag k is blanked on the original date rule and also on the first k rows of each group;
*The row counter stops a lag from picking up the tail of the preceding group when a group;
*does not start in 199501 or has missing months;
data h4_lagret_fund_select_ew;
    set h4_lagret_fund_select_ew;
    by aut_rank autrankn categ;

    *position of the row inside its group, retained by the sum statement;
    if first.categ then w_obs_in_grp = 0;
    w_obs_in_grp + 1;

    *LAG is called on every row so that its queue stays aligned with the dataset order;
    lag1_rets_ew = lag1(rets_ew);
    if yyyymm le 199501 or w_obs_in_grp le 1 then lag1_rets_ew = . ;
    lag2_rets_ew = lag2(rets_ew);
    if yyyymm le 199502 or w_obs_in_grp le 2 then lag2_rets_ew = . ;
    lag3_rets_ew = lag3(rets_ew);
    if yyyymm le 199503 or w_obs_in_grp le 3 then lag3_rets_ew = . ;
    lag4_rets_ew = lag4(rets_ew);
    if yyyymm le 199504 or w_obs_in_grp le 4 then lag4_rets_ew = . ;

    drop _TYPE_ _FREQ_ w_obs_in_grp;
run;

*Regression sample: rows with a non-missing fourth lag;
data h4_lagret_fund_clean_ew; set h4_lagret_fund_select_ew; if lag4_rets_ew ne . ; run;

*regressions;
*Five models per aut_rank autrankn categ group on the equal-weighted series;
proc sort data = h4_lagret_fund_clean_ew;
    by aut_rank autrankn categ yyyymm;
run;

proc reg data = h4_lagret_fund_clean_ew outest = v_h4_lagret_fund_reg_ew noprint tableout;
    *multivariate;
    model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew / edf ADJRSQ;
    *univariate;
    model rets_ew = lag1_rets_ew / edf ADJRSQ;
    model rets_ew = lag2_rets_ew / edf ADJRSQ;
    model rets_ew = lag3_rets_ew / edf ADJRSQ;
    model rets_ew = lag4_rets_ew / edf ADJRSQ;
    by aut_rank autrankn categ;
run;
quit;

*Keep the PARMS, T and PVALUE rows, with PVALUE rows relabelled w_pv in statis;
data w_h4_lagret_fund_reg_ew_org;
    set v_h4_lagret_fund_reg_ew;
    model_n = 100;
    model_n = substr(_MODEL_,6,2)*1;
    statis = _TYPE_;
    if _TYPE_ = 'PVALUE' then statis = 'w_pv';
    if _TYPE_ in ('PARMS', 'T', 'PVALUE');
    keep aut_rank autrankn statis categ model_n _TYPE_
         lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

proc sort data = w_h4_lagret_fund_reg_ew_org;
    by model_n aut_rank autrankn categ statis;
run;

*Model 1 is the multivariate regression;
data w_lagret_fund_reg_ew_org_m1h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 1;
    type = 'multivar';
run;

*Models 2 to 5 are the univariate regressions and each contributes its own lag column;
data w_lagret_fund_reg_ew_org_m2h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 2;
    drop lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m3h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 3;
    keep lag2_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m4h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 4;
    keep lag3_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m5h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 5;
    keep lag4_rets_ew;
run;

*Merge without a BY statement: rows are paired by position across the four datasets;
data w_lagret_fund_reg_ew_org_m2345h4;
    merge w_lagret_fund_reg_ew_org_m2h4
          w_lagret_fund_reg_ew_org_m3h4
          w_lagret_fund_reg_ew_org_m4h4
          w_lagret_fund_reg_ew_org_m5h4;
    type = 'univar';
run;

*Final table keeps the univariate rows, and the T rows are removed after sorting;
data h4_autoc_aggr_r3_separate_grn&ngr;
    set w_lagret_fund_reg_ew_org_m1h4
        w_lagret_fund_reg_ew_org_m2345h4;
    if type = 'univar';
run;

proc sort data = h4_autoc_aggr_r3_separate_grn&ngr;
    by model_n aut_rank autrankn statis;
run;

data h4_autoc_aggr_r3_separate_grn&ngr;
    set h4_autoc_aggr_r3_separate_grn&ngr;
    if statis = "T" then delete;
run;

*Overall Aggregate;

*regressions;
*Same five models, run per aut_rank and categ so that all autrankn rows of a category are pooled;
proc sort data = h4_lagret_fund_clean_ew;
    by aut_rank categ yyyymm;
run;

proc reg data = h4_lagret_fund_clean_ew outest = v_h4_lagret_fund_reg_ew noprint tableout;
    *multivariate;
    model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew / edf ADJRSQ;
    *univariate;
    model rets_ew = lag1_rets_ew / edf ADJRSQ;
    model rets_ew = lag2_rets_ew / edf ADJRSQ;
    model rets_ew = lag3_rets_ew / edf ADJRSQ;
    model rets_ew = lag4_rets_ew / edf ADJRSQ;
    by aut_rank categ;
run;
quit;

*Keep the PARMS, T and PVALUE rows, with PVALUE rows relabelled w_pv in statis;
data w_h4_lagret_fund_reg_ew_org;
    set v_h4_lagret_fund_reg_ew;
    model_n = 100;
    model_n = substr(_MODEL_,6,2)*1;
    statis = _TYPE_;
    if _TYPE_ = 'PVALUE' then statis = 'w_pv';
    if _TYPE_ in ('PARMS', 'T', 'PVALUE');
    keep aut_rank statis categ model_n _TYPE_
         lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

proc sort data = w_h4_lagret_fund_reg_ew_org;
    by model_n aut_rank categ statis;
run;

*Model 1 is the multivariate regression;
data w_lagret_fund_reg_ew_org_m1h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 1;
    type = 'multivar';
run;

*Models 2 to 5 are the univariate regressions and each contributes its own lag column;
data w_lagret_fund_reg_ew_org_m2h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 2;
    drop lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m3h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 3;
    keep lag2_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m4h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 4;
    keep lag3_rets_ew;
run;

data w_lagret_fund_reg_ew_org_m5h4;
    set w_h4_lagret_fund_reg_ew_org;
    if model_n = 5;
    keep lag4_rets_ew;
run;

*Merge without a BY statement: rows are paired by position across the four datasets;
data w_lagret_fund_reg_ew_org_m2345h4;
    merge w_lagret_fund_reg_ew_org_m2h4
          w_lagret_fund_reg_ew_org_m3h4
          w_lagret_fund_reg_ew_org_m4h4
          w_lagret_fund_reg_ew_org_m5h4;
    type = 'univar';
run;

*Final table keeps the univariate rows, and the T rows are removed after sorting;
data h4_autoc_aggr_r3_together_grn&ngr;
    set w_lagret_fund_reg_ew_org_m1h4
        w_lagret_fund_reg_ew_org_m2345h4;
    if type = 'univar';
run;

proc sort data = h4_autoc_aggr_r3_together_grn&ngr;
    by model_n aut_rank statis;
run;

data h4_autoc_aggr_r3_together_grn&ngr;
    set h4_autoc_aggr_r3_together_grn&ngr;
    if statis = "T" then delete;
run;


/***********************************************************/
*Aux test: all aggregate together;
*Equal-weighted mean of rets per aut_rank categ and month, pooling every autrankn of the category;
proc sort data = h4_lagret_fund_select_temp; by aut_rank categ yyyymm; run;
proc summary data = h4_lagret_fund_select_temp;
    var rets;
    output out = aux_h4_lagret_allew mean = rets_ew;
    by aut_rank categ yyyymm;
run;

*lag ret;
*Lag k is blanked on the original date rule and also on the first k rows of each group;
*The row counter stops a lag from picking up the tail of the preceding group when a group;
*does not start in 199501 or has missing months;
data aux_h4_lagret_allew;
    set aux_h4_lagret_allew;
    by aut_rank categ;

    *position of the row inside its group, retained by the sum statement;
    if first.categ then w_obs_in_grp = 0;
    w_obs_in_grp + 1;

    *LAG is called on every row so that its queue stays aligned with the dataset order;
    lag1_rets_ew = lag1(rets_ew);
    if yyyymm le 199501 or w_obs_in_grp le 1 then lag1_rets_ew = . ;
    lag2_rets_ew = lag2(rets_ew);
    if yyyymm le 199502 or w_obs_in_grp le 2 then lag2_rets_ew = . ;
    lag3_rets_ew = lag3(rets_ew);
    if yyyymm le 199503 or w_obs_in_grp le 3 then lag3_rets_ew = . ;
    lag4_rets_ew = lag4(rets_ew);
    if yyyymm le 199504 or w_obs_in_grp le 4 then lag4_rets_ew = . ;

    drop _TYPE_ _FREQ_ w_obs_in_grp;
run;

*Regression sample: rows with a non-missing fourth lag;
data aux_h4_lagret_allew_clean; set aux_h4_lagret_allew; if lag4_rets_ew ne . ; run;
proc sort data = aux_h4_lagret_allew_clean; by aut_rank categ yyyymm; run;
proc reg data = aux_h4_lagret_allew_clean outest = v_h4aux_autoc_aggr_grn&ngr noprint tableout;
    *univariate;
    model rets_ew = lag1_rets_ew /edf ADJRSQ;
    model rets_ew = lag2_rets_ew /edf ADJRSQ;
    model rets_ew = lag3_rets_ew /edf ADJRSQ;
    model rets_ew = lag4_rets_ew /edf ADJRSQ;
    by aut_rank categ;
run; quit;

*Keep only the coefficient rows;
data v_h4aux_autoc_aggr_grn&ngr;
    set v_h4aux_autoc_aggr_grn&ngr;
    if _TYPE_ = "PARMS";
    drop rets_ew _RMSE_;
run;





*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
****Appendix for Code 2: Simulation to Verify Convergence/Stability/Bias Of MA Estimation Procedure;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;

/******************************************************************************************************************/
***************** Check PROC ARIMA/MA estimator convergence/unbiasedness;

*This simulation was run before the empirical analysis presented in the paper, as a way to ensure the MA estimation procedure... ;
*... used (i.e., PROC ARIMA in SAS) would lead to unbiased results for datasets with characteristics similar to ours and under plausible DGP assumptions;
*...this applies to any MA-based unsmoothing, including GLM as well as the 3-step method;
*Brief explanation: in the simulation, we have two groups of funds, funds in group 1 hold liquid assets and their returns are not smoothed on average (true scaled MA(3) coefficients are 1 0 0 0, plus empirical variation);
*Funds in group 2 hold illiquid assets and report returns that are a function of true economic returns smoothed with MA(3) coefficients of 0.6 0.2 0.1 0.1;
*We recommend playing around and calibrating the MA estimation using a similar simulation before attempting to use the unsmoothing technique on real data;
*The simulation shows that the SAS PROC ARIMA procedure plus the rule of thumb shown below (look for MA3_status and MA3_flag) can effectively recover the MA(3) parameters for both groups of funds without knowledge of the true data generating process;
*The results suggest that the procedure will not tend to over- or under-unsmooth fund returns;
*(for users who do not have access to SAS ARIMA diagnostics, we find that the MA3_flag rule is a reasonably good approximation for using both MA3_status and MA3_flag when using a sample of hedge fund returns);

/*

*"MA3_status" => uses SAS's PROC ARIMA diagnostics for convergence/stability;
*"MA3_flag" => Simple rule of thumb: if any rescaled theta is >1.25 or <-0.45, then the MA process did not converge/does not fit the data;
*adj1 uses MA3_status, adj2 uses MA3_flag, adj3 uses both MA3_status and MA3_flag;

%let th0 = 0.6;
%let th1 = 0.2;
%let th2 = 0.1;
%let th3 = 0.1;

***How many Loops;
%let n_loops = 100;


%macro simloop;

%do loopn = 1 %to &n_loops;

data simma00;
set hf00;
run;

proc sort data = simma00; by fundid_mer yyyymm; run;

data simbootrets;
set simma00;
*call streaminit(1);
call streaminit(&loopn);
sortv1 = rand("Uniform");
drawret = ret;
keep drawret sortv1;
run;
proc sort data = simbootrets; by sortv1; run;

data simma00;
merge simma00 simbootrets;
run;

data simma00;
set simma00;
lag1_drawret = lag1(drawret);
lag2_drawret = lag2(drawret);
lag3_drawret = lag3(drawret);
retsim1 = drawret;
retsim2 = &th0*drawret +&th1*lag1_drawret +&th2*lag2_drawret +&th3*lag3_drawret;
if fund_seq le 3 then retsim1 = . ;
if fund_seq le 3 then retsim2 = . ;
run;

proc sort data = simma00; by fundid_mer yyyymm; run;
proc summary data = simma00; 
var retsim1 retsim2;
output out = simma00_meanstd
mean = std = /autoname;
by fundid_mer; run;

*Estimate MA;
*de-mean simulated returns;
data simma00; 
merge simma00 simma00_meanstd; 
by fundid_mer;
if fund_seq le 3 then delete;
dem_retsim1 = retsim1 - retsim1_mean;
dem_retsim2 = retsim2 - retsim2_mean;
drop _TYPE_ _FREQ_;
run;


*MA with 3 lags, retsim1;
proc sort data = simma00; by fundid_mer yyyymm; run;
proc arima data= simma00;
identify var = dem_retsim1 noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= sim1_arima_ma3_est OUTSTAT= sim1_arima_ma3_diag noprint;
by fundid_mer;
run;
quit;

data sim1_arima_ma3_est;
set sim1_arima_ma3_est;
simv = 1;
if _TYPE_ = 'EST';
ma3_STATUS_ = _STATUS_;
ma3_theta_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_theta_0 = 1/ma3_theta_sum;
ma3_theta_1 = -ma1_1/ma3_theta_sum;
ma3_theta_2 = -ma1_2/ma3_theta_sum;
ma3_theta_3 = -ma1_3/ma3_theta_sum;
ma3_theta_sum_norm = ma3_theta_0 + ma3_theta_1 + ma3_theta_2 + ma3_theta_3;
run;


*MA with 3 lags, retsim2;
proc sort data = simma00; by fundid_mer yyyymm; run;
proc arima data= simma00;
identify var = dem_retsim2 noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= sim2_arima_ma3_est OUTSTAT= sim2_arima_ma3_diag noprint;
by fundid_mer;
run;
quit;

data sim2_arima_ma3_est;
set sim2_arima_ma3_est;
simv = 2;
if _TYPE_ = 'EST';
ma3_STATUS_ = _STATUS_;
ma3_theta_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_theta_0 = 1/ma3_theta_sum;
ma3_theta_1 = -ma1_1/ma3_theta_sum;
ma3_theta_2 = -ma1_2/ma3_theta_sum;
ma3_theta_3 = -ma1_3/ma3_theta_sum;
ma3_theta_sum_norm = ma3_theta_0 + ma3_theta_1 + ma3_theta_2 + ma3_theta_3;
run;


*Set estimates together;
data w_sim_arima_ma3_est_&loopn;
set sim1_arima_ma3_est sim2_arima_ma3_est;

true_th0 = 1;
true_th1 = 0;
true_th2 = 0;
true_th3 = 0;

if simv = 2 then do;
true_th0 = &th0;
true_th1 = &th1;
true_th2 = &th2;
true_th3 = &th3;
end;

th0_unadj = ma3_theta_0;
th1_unadj = ma3_theta_1;
th2_unadj = ma3_theta_2;
th3_unadj = ma3_theta_3;

th0_adj1 = ma3_theta_0;
th1_adj1 = ma3_theta_1;
th2_adj1 = ma3_theta_2;
th3_adj1 = ma3_theta_3;

th0_adj2 = ma3_theta_0;
th1_adj2 = ma3_theta_1;
th2_adj2 = ma3_theta_2;
th3_adj2 = ma3_theta_3;

th0_adj3 = ma3_theta_0;
th1_adj3 = ma3_theta_1;
th2_adj3 = ma3_theta_2;
th3_adj3 = ma3_theta_3;

ma3_status = 0; if ma3_STATUS_ ne "0 Converged" then ma3_status = 1;

ma3_flag = 0; 
if ma3_theta_0 gt 1.25 or ma3_theta_1 gt 1.25 or ma3_theta_2 gt 1.25 or ma3_theta_3 gt 1.25 then ma3_flag = 1;
if ma3_theta_0 le -0.45 or ma3_theta_1 le -0.45 or ma3_theta_2 le -0.45 or ma3_theta_3 le -0.45 then ma3_flag = 1;

if ma3_status = 1 then do;
th0_adj1 = 1; 
th1_adj1 = 0; 
th2_adj1 = 0; 
th3_adj1 = 0;
end;

if ma3_flag = 1 then do;
th0_adj2 = 1; 
th1_adj2 = 0; 
th2_adj2 = 0; 
th3_adj2 = 0;
end;

if ma3_status = 1 or ma3_flag = 1 then do;
th0_adj3 = 1; 
th1_adj3 = 0; 
th2_adj3 = 0; 
th3_adj3 = 0;
end;

abse_th0_unadj = abs(th0_unadj - true_th0);
abse_th1_unadj = abs(th1_unadj - true_th1);
abse_th2_unadj = abs(th2_unadj - true_th2);
abse_th3_unadj = abs(th3_unadj - true_th3);

abse_th0_adj1 = abs(th0_adj1 - true_th0);
abse_th1_adj1 = abs(th1_adj1 - true_th1);
abse_th2_adj1 = abs(th2_adj1 - true_th2);
abse_th3_adj1 = abs(th3_adj1 - true_th3);

abse_th0_adj2 = abs(th0_adj2 - true_th0);
abse_th1_adj2 = abs(th1_adj2 - true_th1);
abse_th2_adj2 = abs(th2_adj2 - true_th2);
abse_th3_adj2 = abs(th3_adj2 - true_th3);

abse_th0_adj3 = abs(th0_adj3 - true_th0);
abse_th1_adj3 = abs(th1_adj3 - true_th1);
abse_th2_adj3 = abs(th2_adj3 - true_th2);
abse_th3_adj3 = abs(th3_adj3 - true_th3);

diff_th0_unadj = th0_unadj - true_th0;
diff_th0_adj1 = th0_adj1 - true_th0;
diff_th0_adj2 = th0_adj2 - true_th0;
diff_th0_adj3 = th0_adj3 - true_th0;

diff_th1_unadj = th1_unadj - true_th1;
diff_th1_adj1 = th1_adj1 - true_th1;
diff_th1_adj2 = th1_adj2 - true_th1;
diff_th1_adj3 = th1_adj3 - true_th1;

diff_th2_unadj = th2_unadj - true_th2;
diff_th2_adj1 = th2_adj1 - true_th2;
diff_th2_adj2 = th2_adj2 - true_th2;
diff_th2_adj3 = th2_adj3 - true_th2;

diff_th3_unadj = th3_unadj - true_th3;
diff_th3_adj1 = th3_adj1 - true_th3;
diff_th3_adj2 = th3_adj2 - true_th3;
diff_th3_adj3 = th3_adj3 - true_th3;

run;

%end;


data sim_arima_ma3_est;
set %do loopn = 1 %to &n_loops;
w_sim_arima_ma3_est_&loopn
%end; ;
run;


proc summary data = sim_arima_ma3_est;
class simv;
var th0_unadj th0_adj1 th0_adj2 th0_adj3;
output out = sim_check_th0
mean = median = p1 = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var abse_th0_unadj abse_th0_adj1 abse_th0_adj2 abse_th0_adj3;
output out = sim_check_abse_th0
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var abse_th1_unadj abse_th1_adj1 abse_th1_adj2 abse_th1_adj3;
output out = sim_check_abse_th1
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var abse_th2_unadj abse_th2_adj1 abse_th2_adj2 abse_th2_adj3;
output out = sim_check_abse_th2
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var abse_th3_unadj abse_th3_adj1 abse_th3_adj2 abse_th3_adj3;
output out = sim_check_abse_th3
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var diff_th0_unadj diff_th0_adj1 diff_th0_adj2 diff_th0_adj3;
output out = sim_check_bias_th0
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var diff_th1_unadj diff_th1_adj1 diff_th1_adj2 diff_th1_adj3;
output out = sim_check_bias_th1
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var diff_th2_unadj diff_th2_adj1 diff_th2_adj2 diff_th2_adj3;
output out = sim_check_bias_th2
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var diff_th3_unadj diff_th3_adj1 diff_th3_adj2 diff_th3_adj3;
output out = sim_check_bias_th3
mean = median = p99 = /autoname;
run;

proc summary data = sim_arima_ma3_est;
class simv;
var true_th0 th0_unadj th0_adj1 th0_adj2 th0_adj3
true_th1 th1_unadj th1_adj1 th1_adj2 th1_adj3
true_th2 th2_unadj th2_adj1 th2_adj2 th2_adj3
true_th3 th3_unadj th3_adj1 th3_adj2 th3_adj3;
output out = sim_check_avg_th
mean = true_th0 th0_unadj th0_adj1 th0_adj2 th0_adj3
true_th1 th1_unadj th1_adj1 th1_adj2 th1_adj3
true_th2 th2_unadj th2_adj1 th2_adj2 th2_adj3
true_th3 th3_unadj th3_adj1 th3_adj2 th3_adj3;
run;


%mend simloop;
%simloop;

*/








*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 3: ANALYSIS OF BETA-SORTED HEDGE FUND PORTFOLIO RETURNS;
****Produces: Table 4;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;

***** Test reported in Table 4: Define hedge fund categories by loadings on risk factors;
***The following code should be run after the main HF code has been run;

**********************************************************************************************************************************;
*Form strategies based on factor betas;
*low 30, mid 40, top 30;

*Factor Coefficients are in v_s3_fund_fh00_coef_all;

/* Entry year of each fund: one row per fundid_mer holding first_yyyy, the
   year part of the earliest yyyymm the fund has in s3_00a.
   round(yyyymm/100,1) acts as a truncation to the year as long as the last
   two digits are a month 01-12 (fraction below .5) - assumes yyyymm is
   coded as year*100 + month, TODO confirm. */
proc sort data = s3_00a(keep = fundid_mer yyyymm) out = w_au_year;
by fundid_mer yyyymm;
run;

data w_au_year;
set w_au_year;
by fundid_mer;
/* rows are in date order within fund, so the first row is the entry month */
if first.fundid_mer;
first_yyyy = round(yyyymm/100,1);
keep fundid_mer first_yyyy;
run;

/* Build betacat00, keyed by fundid_mer:
   step 1 keeps funds present in BOTH the coefficient file
          (v_s3_fund_fh00_coef_all) and the entry-year list (w_au_year);
   step 2 left-joins the Cdimson coefficient file (r0_cd3_coeff), i.e. funds
          without a row there are kept and simply get missing values.
   Both coefficient files are sorted in place as a side effect. */
proc sort data = v_s3_fund_fh00_coef_all; by fundid_mer; run;

data betacat00;
merge v_s3_fund_fh00_coef_all(in = has_coef)
      w_au_year(in = has_year);
by fundid_mer;
if not (has_coef and has_year) then delete;
run;

*also add Cdimson betas;
proc sort data = betacat00; by fundid_mer; run;
proc sort data = r0_cd3_coeff; by fundid_mer; run;

data betacat00;
merge betacat00(in = in_base)
      r0_cd3_coeff;
by fundid_mer;
if in_base;
run;

/* Choose the betas used for sorting.
   r0_fh1-r0_fh8 (the regular betas already in betacat00) are overwritten by
   the Cdimson betas r0cd3_best_fh1-r0cd3_best_fh8 whenever the latter are
   not missing. A fund without a Cdimson beta keeps its regular beta.
   To sort on the regular betas only, skip this step.
   Original author note: results are extremely similar either way, and the
   reported results use the Cdimson sort, following the referees requests. */
data betacat00;
set betacat00;
array sort_beta{8} r0_fh1-r0_fh8;
array cd_beta{8} r0cd3_best_fh1-r0cd3_best_fh8;
do _k = 1 to 8;
if cd_beta{_k} ne . then sort_beta{_k} = cd_beta{_k};
end;
drop _k;
run;


/* Breakpoints for the beta sorts: 30th and 70th percentile of each sorting
   beta, computed separately within each entry-year cohort (first_yyyy).
   /autoname yields r0_fhK_P30 and r0_fhK_P70, which are then attached to
   every fund of the cohort. */
proc sort data = betacat00; by first_yyyy; run;

proc summary data = betacat00;
by first_yyyy;
var r0_fh1-r0_fh8;
output out = v_betas_perc p30 = p70 = / autoname;
run;

data betacat00;
merge betacat00
      v_betas_perc;
by first_yyyy;
run;

/* Assign each fund to a beta group per factor (fh1g ... fh8g):
     1 = beta at or below the cohort 30th percentile   (low)
     2 = above the 30th and below the 70th percentile  (mid)
     3 = at or above the 70th percentile               (high)
   Fix: SAS orders a missing value below every number, so the previous
   unconditional default of 1 silently classified funds with a MISSING beta
   as low-beta. Such funds now get a missing group instead. For funds with a
   non-missing beta the assignment is unchanged. */
data betacat00;
set betacat00;
array beta{8} r0_fh1-r0_fh8;
array cut30{8} r0_fh1_p30 r0_fh2_p30 r0_fh3_p30 r0_fh4_p30
               r0_fh5_p30 r0_fh6_p30 r0_fh7_p30 r0_fh8_p30;
array cut70{8} r0_fh1_p70 r0_fh2_p70 r0_fh3_p70 r0_fh4_p70
               r0_fh5_p70 r0_fh6_p70 r0_fh7_p70 r0_fh8_p70;
array grp{8} fh1g fh2g fh3g fh4g fh5g fh6g fh7g fh8g;
do _k = 1 to 8;
if missing(beta{_k}) then grp{_k} = .;
else if beta{_k} ge cut70{_k} then grp{_k} = 3;
else if beta{_k} gt cut30{_k} then grp{_k} = 2;
else grp{_k} = 1;
end;
drop _k;
run;

/* Average betas by beta-sorted category.
   For each factor K = 1..8 this writes b_betas_by_fhKg: the mean of
   r0_fhK, r1_fhK, r3_fhK and aut_rank within each group of fhKg.
   betacat00 is re-sorted in place on every pass, so it ends up sorted by
   fh8g, exactly as with the eight hand-written copies. */
%macro avg_betas_by_fhg;
%local k;
%do k = 1 %to 8;
proc sort data = betacat00; by fh&k.g; run;
proc summary data = betacat00;
by fh&k.g;
var r0_fh&k r1_fh&k r3_fh&k aut_rank;
output out = b_betas_by_fh&k.g mean = / autoname;
run;
%end;
%mend avg_betas_by_fhg;
%avg_betas_by_fhg;

/* Attach the beta-based categories to the fund return file.
   v_betacat_list1 holds, per fund, the eight group labels twice: under
   their original names (fh1g..fh8g) and as sort_fh1..sort_fh8, the names
   the beta_sort macro groups by. */
data v_betacat_list1;
set betacat00;
array grp_src{8} fh1g fh2g fh3g fh4g fh5g fh6g fh7g fh8g;
array grp_key{8} sort_fh1-sort_fh8;
do _k = 1 to 8;
grp_key{_k} = grp_src{_k};
end;
keep fundid_mer first_yyyy
     fh1g fh2g fh3g fh4g fh5g fh6g fh7g fh8g
     sort_fh1 sort_fh2 sort_fh3 sort_fh4
     sort_fh5 sort_fh6 sort_fh7 sort_fh8;
run;

/* Fund returns are in lagret_fund (sorted in place here). Every row of
   lagret_fund is kept. Funds with no entry in v_betacat_list1 get missing
   group labels. */
proc sort data = lagret_fund; by fundid_mer; run;
proc sort data = v_betacat_list1; by fundid_mer; run;

data betacat01;
merge lagret_fund(in = in_ret)
      v_betacat_list1;
by fundid_mer;
if in_ret;
run;

*************MACRO: Run analysis for the 8 beta-sorted categories;
/* For each sorting factor ii = 1..8:
     1. average ret, glm_ret and s3_ret across funds by beta group and month
        (ewr0, ewr1, ewr3),
     2. regress each average on its own 1st, 2nd, 3rd and 4th lag, one
        univariate model per lag, separately within each beta group,
     3. collect the lag-1 coefficient and its p-value into a 9-column table
        (low/mid/high beta x R0/R1/R3).
   Outputs: v_betacat01_clean_all1..8, v_tab_corr1_1..8, tab_main_corr1
   (lag-1 table for all factors) and tab_all_4lags (all lag coefficients). */
%macro beta_sort;

%local rr gg ll colno retlab grplab;

data betacat01_samp;
set betacat01;
run;

%do ii = 1 %to 8;

/* equal-weighted return of each beta group in each month */
proc sort data = betacat01_samp; by sort_fh&ii yyyymm; run;
proc summary data = betacat01_samp;
by sort_fh&ii yyyymm;
var ret glm_ret s3_ret;
output out = betacat01_ewret_sel mean = ewr0 ewr1 ewr3;
run;

/* Lags 1-4 of each series. The LAG queues run straight across group
   boundaries, so lag k is blanked for months up to 1995(0k) and the sample
   is then cut to start in 199505. This only removes every cross-group value
   if each group starts no later than 199501 with no missing months -
   NOTE(review): confirm against lagret_fund. */
data betacat01_ewret_sel;
set betacat01_ewret_sel;
%do rr = 1 %to 3;
%let retlab = %scan(0 1 3, &rr);
%do ll = 1 %to 4;
lag&ll._ewr&retlab = lag&ll(ewr&retlab);
if yyyymm le %eval(199500 + &ll) then lag&ll._ewr&retlab = . ;
%end;
%end;
drop _TYPE_ _FREQ_;
if yyyymm ge 199505;
run;

/* twelve univariate autoregressions (3 series x 4 lags) per beta group.
   tableout adds the p-value rows to the outest dataset */
proc sort data = betacat01_ewret_sel; by sort_fh&ii yyyymm; run;
proc reg data = betacat01_ewret_sel outest = v_betacat01_ewret_sel noprint tableout;
by sort_fh&ii;
%do rr = 1 %to 3;
%let retlab = %scan(0 1 3, &rr);
%do ll = 1 %to 4;
model ewr&retlab = lag&ll._ewr&retlab /edf ADJRSQ;
%end;
%end;
run; quit;

/* keep only the coefficient and p-value rows, and only the lag columns */
data v_betacat01_clean_all&ii;
set v_betacat01_ewret_sel;
if _TYPE_ in ("PARMS", "PVALUE");
drop _DEPVAR_ _RMSE_ Intercept ewr0 ewr1 ewr3
     _IN_ _P_ _EDF_ _RSQ_ _ADJRSQ_ _MODEL_;
run;

/* Table columns 1-9, numbered (series - 1)*3 + group:
     cols 1-3 = low, mid, high beta for R0
     cols 4-6 = same for R1
     cols 7-9 = same for R3
   Each column dataset holds the rows of the lag-1 model only (the rows
   where the lag-1 variable is non-missing). Column 1 also carries the
   factor number and the statistic label. */
%do rr = 1 %to 3;
%let retlab = %scan(0 1 3, &rr);
%do gg = 1 %to 3;
%let grplab = %scan(low mid hig, &gg);
%let colno = %eval(3*(&rr - 1) + &gg);
data w_tab_col&colno;
set v_betacat01_clean_all&ii;
%if &colno = 1 %then %do;
sortfactor = &ii;
statis = _TYPE_;
%end;
if sort_fh&ii = &gg;
if lag1_ewr&retlab ne . ;
corr1_&grplab.beta_r&retlab = lag1_ewr&retlab;
keep %if &colno = 1 %then %do; sortfactor statis %end; corr1_&grplab.beta_r&retlab;
run;
%end;
%end;

/* side-by-side (row-by-row, no BY) merge of the nine columns */
data v_tab_corr1_&ii;
merge %do colno = 1 %to 9; w_tab_col&colno %end; ;
run;

%end;

/* main table: lag-1 results stacked over the 8 factors */
data tab_main_corr1;
set %do ii = 1 %to 8; v_tab_corr1_&ii %end; ;
run;

/* table with all coefficients */
data tab_all_4lags;
set %do ii = 1 %to 8; v_betacat01_clean_all&ii %end; ;
run;

%mend beta_sort;
%beta_sort;


/* Clean tab_all_4lags for output.
   tab_all_4lags stacks the per-factor results, so on any row only one of
   sort_fh1..sort_fh8 is filled. The missing ones are recoded to 99 so the
   eight columns can serve as a composite BY key. The summary then reports
   the mean lag-1 coefficient / p-value per key and statistic. */
data vtab_all_4lags_clean;
set tab_all_4lags;
statis = _TYPE_;
array sort_key{8} sort_fh1-sort_fh8;
do _k = 1 to 8;
if sort_key{_k} = . then sort_key{_k} = 99;
end;
drop _k;
run;

proc sort data = vtab_all_4lags_clean;
by sort_fh1-sort_fh8 statis;
run;

proc summary data = vtab_all_4lags_clean;
by sort_fh1-sort_fh8 statis;
var lag1_ewr0 lag1_ewr1 lag1_ewr3;
output out = tab_all_4lags_clean(drop = _TYPE_ _FREQ_)
       mean = R0_cor1 R1_cor1 R3_cor1;
run;






*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 4: PREDICTABILITY OF FUND ALPHAS;
****Produces: Figure 3;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;


******** Code for the Performance Persistence Results;
*** This code produces results reported in Figure 3 (as well as Internet Appendix IA.8, IA.9 and IA.10);
*** The code is organized in 2 steps;
*** Step 1 computes rolling-windows alphas according to various methods (using only fund returns up to time t-1);
*** Step 2 forms portfolios of funds at time t based on lagged alphas, and then computes the alphas of these portfolios;

*******************************************************************************************************************************;
*******************************************************************************************************************************;

*** Step 1 computes rolling-windows alphas according to various methods (using only fund returns up to time t-1);
*** Main test uses alphas from past 24 months;

/* Log verbosity. The two statements are a manual toggle: the second one
   overrides the first, so as written the log is verbose (notes and source
   shown, at most 2 complete error messages). Comment out the second line to
   run quietly. */
options nonotes nosource nosource2 errors=0;
options notes source source2 errors=2;

/* ODSOff: turn off ODS graphics, exclude all ODS output objects and stop
   adding entries to the Results window. */
%macro ODSOff();
ods graphics off; ods exclude all; ods noresults;
%mend ODSOff;

/* ODSOn: reverse of ODSOff. */
%macro ODSOn();
ods graphics on; ods exclude none; ods results;
%mend ODSOn;

/* The code starts from raw hedge fund returns and unsmoothes them.
   hf00_all: fund-month panel restricted to the columns used below. */
data hf00_all(keep = fundid_mer ret yyyymm assets_fill fund_type stra bh_fund);
set hfautoc.hf_merge00_july2020_min5;
run;

/* lag_assets_fill = assets_fill of the same fund on the previous row
   (rows ordered by yyyymm within fund). It is missing on the first row of
   each fund. */
proc sort data = hf00_all; by fundid_mer yyyymm; run;

data hf00_all;
set hf00_all;
by fundid_mer;
lag_assets_fill = lag(assets_fill);
if first.fundid_mer then lag_assets_fill = .;
run;

/* Attach fund classifications (e.g., Relative Value, Event Driven, etc).
   This follows the classification table from Joenvaara et al 2021 (also see
   appendix A1). The mapping (fund_type, stra) -> jkt_category is read from
   an Excel sheet, and the category is stored on every fund-month as categ.
   NOTE(review): the merge has no IN= filter, so a (fund_type, stra) pair
   that appears only in the sheet adds a row with no fund data. Such rows
   have a missing yyyymm. */
proc import
datafile = "C:\Users\&pcname.\Dropbox\Research\Hedge Funds\Unsmoothing Returns\Hedge Fund Analysis\Summer_Revision\Strat_manual_1.xlsx"
out = work.Strat_manual_1
dbms = xlsx replace;
getnames = yes;
run;

proc sort data = hf00_all; by fund_type stra; run;
proc sort data = strat_manual_1; by fund_type stra; run;

data hf00_all;
merge hf00_all
      strat_manual_1;
by fund_type stra;
categ = jkt_category;
drop jkt_category;
run;

/* Drop the Other and FOF categories, give each remaining category a numeric
   rank (aut_rank), and restrict the sample to 199501-201712.
   Any category not listed below, including a missing one, gets rank 1.
   NOTE(review): Global Macro is matched with a space while the other
   multi-word labels use underscores - confirm against the Excel sheet. */
data hf00_all;
set hf00_all;
select (categ);
when ('Other', 'FOF') delete;
when ('Event_driven') aut_rank = 2;
when ('Multi_strategy') aut_rank = 3;
when ('Emerging_Markets') aut_rank = 4;
when ('Sector') aut_rank = 5;
when ('Long_Only') aut_rank = 6;
when ('Long_Short') aut_rank = 7;
when ('Market_Neutral') aut_rank = 8;
when ('Global Macro') aut_rank = 9;
when ('CTA') aut_rank = 10;
otherwise aut_rank = 1;
end;
/* sample window. Rows with a missing yyyymm fail the lower bound */
if yyyymm ge 199501 and yyyymm le 201712;
run;

/* Factor file: the eight factors are copied to the short names fh1..fh8
   (fh1 = sp500_rf, fh2 = size_spread, fh3 = emerg_mkt_rf, fh4 = FS_bond_mkt,
   fh5 = FS_credit_sprd, fh6 = PTFSBD, fh7 = PTFSFX, fh8 = PTFSCOM). rf is
   kept as is. */
data hffact;
set hfautoc.hf_factors;
array fh{8} fh1-fh8;
array raw_factor{8} sp500_rf size_spread emerg_mkt_rf FS_bond_mkt
                    FS_credit_sprd PTFSBD PTFSFX PTFSCOM;
do _k = 1 to 8;
fh{_k} = raw_factor{_k};
end;
keep yyyymm fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 rf;
run;

/* One- and two-row lags of every factor: l1_fhK and l2_fhK, with rows in
   yyyymm order. The statements are generated by a macro rather than an
   array loop on purpose: each LAG call in the program text keeps its own
   queue, so a single LAG call inside a DO loop would mix the factors. */
proc sort data = hffact; by yyyymm; run;

%macro hffact_lag_stmts;
%local lg k;
%do lg = 1 %to 2;
%do k = 1 %to 8;
l&lg._fh&k = lag&lg(fh&k);
%end;
%end;
%mend hffact_lag_stmts;

data hffact;
set hffact;
%hffact_lag_stmts
run;


/* Backfill (imputed add) dates from the two source files, re-keyed to the
   merged fund id: fundid_mer = source id * 100 + source code, with code 1
   for the tass file and code 2 for the bh file. */
data tass_imputed_add_date1(keep = fundid_mer imputed_add_date);
set hfautoc.tass_imputed_add_date;
fundid_mer = 100*fundid + 1;
run;

data bh_imputed_add_date1(keep = fundid_mer imputed_add_date);
set hfautoc.bh_imputed_add_date;
fundid_mer = 100*fund_id + 2;
run;

/* stack the two sources */
data imputed_add_dates;
set tass_imputed_add_date1
    bh_imputed_add_date1;
run;


/************************************************************************************/
*** Loop to get unsmoothed returns only using past returns;

%let yystart = 2000; *start in 2000;
%let yyend = 2017; *end in 2017;

%macro rollunsm;


%do yearto = &yystart %to &yyend;


data hf00;
set hf00_all;
if yyyymm le %eval(&yearto*100 + 12);
run;


proc sort data = hf00; by fundid_mer; run;

proc summary data = hf00; 
var ret;
output out = w_mean_ret_1 mean = av_ret_fundid std = std_ret_fundid;
by fundid_mer; run;

data hf00; 
merge hf00 w_mean_ret_1;
by fundid_mer; 
fundid_mer_obs = _FREQ_;
if fundid_mer_obs ge 36;
drop _FREQ_ _TYPE_;
run;

proc sort data = hf00; by fundid_mer yyyymm; run;

data hf00; 
set hf00;
by fundid_mer;
if first.fundid_mer then fund_seq = 1;
else fund_seq + 1;
run; 

proc sort data = hf00; by yyyymm categ; run;
proc summary data = hf00; 
var ret;
output out = w_categ_mm_n mean = av_ret_categ;
by yyyymm categ; run;

data w_count_categ_funds; set hf00; keep fundid_mer ret categ; run;
proc sort data = w_count_categ_funds nodupkey; by categ fundid_mer; run;

proc summary data = w_count_categ_funds; 
var ret;
output out = w_categ_n mean = av_ret_categ;
by categ; run;


/******************************************************************************************************************/
*Part 1: 1-step unsmoothing as in GLM(2004);

*De-meaning fund return;
data hf00; 
set hf00; 
dem_ret = ret - av_ret_fundid;
run;

*MA with 3 lags;
proc sort data = hf00; by fundid_mer yyyymm; run;
proc arima data= hf00 /*out = arima_ma2_est_out*/;
identify var = dem_ret noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= arima_ma3_est OUTSTAT=arima_ma3_diag noprint;
estimate q= 2 noint ma = -0.2 -0.2 method=ml OUTEST= arima_ma2_est OUTSTAT=arima_ma2_diag noprint;
estimate q= 1 noint ma = -0.2 method=ml OUTEST= arima_ma1_est OUTSTAT=arima_ma1_diag noprint;
estimate q= 0 noint method=ml OUTEST= arima_ma0_est OUTSTAT=arima_ma0_diag noprint;
*forecast noprint;
by fundid_mer;
run;
quit;

*Compare AIC from the 4 possible models;
data arima_ma0_diag; set arima_ma0_diag; ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data arima_ma1_diag; set arima_ma1_diag; ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data arima_ma2_diag; set arima_ma2_diag; ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data arima_ma3_diag; set arima_ma3_diag; ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;

data hf00_diag; set hf00; keep fundid_mer categ aut_rank; run;
proc sort data = hf00_diag nodupkey; by fundid_mer; run;

data hf00_diag;
merge hf00_diag arima_ma0_diag arima_ma1_diag arima_ma2_diag arima_ma3_diag;
by fundid_mer;
if _STAT_ = "AIC" or _STAT_ = "LOGLIK" or _STAT_ = "CONV" or _STAT_ = "ERRORVAR";
run;

data hf00_diag; *general step to identify minimum;
set hf00_diag;
ma0_win = 0; ma1_win = 0; ma2_win = 0; ma3_win = 0;
if ma0_value = min(ma0_value,ma1_value,ma2_value,ma3_value) then ma0_win = 1;
if ma1_value = min(ma0_value,ma1_value,ma2_value,ma3_value) then ma1_win = 1;
if ma2_value = min(ma0_value,ma1_value,ma2_value,ma3_value) then ma2_win = 1;
if ma3_value = min(ma0_value,ma1_value,ma2_value,ma3_value) then ma3_win = 1;
run;

*Estimated thetas for case with 1, 2 and 3 MA lags;
data arima_ma3_est;
set arima_ma3_est;
if _TYPE_ = "EST";
ma3_STATUS_ = _STATUS_;
ma3_theta_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_theta_0 = 1/ma3_theta_sum;
ma3_theta_1 = -ma1_1/ma3_theta_sum;
ma3_theta_2 = -ma1_2/ma3_theta_sum;
ma3_theta_3 = -ma1_3/ma3_theta_sum;
ma3_theta_sum_norm = ma3_theta_0 + ma3_theta_1 + ma3_theta_2 + ma3_theta_3;
keep fundid_mer ma3_STATUS_ ma3_theta_0 ma3_theta_1 ma3_theta_2 ma3_theta_3 ma3_theta_sum;
run;

data arima_ma2_est;
set arima_ma2_est;
if _TYPE_ = "EST";
ma2_STATUS_ = _STATUS_;
ma2_theta_sum = 1 - ma1_1 - ma1_2;
ma2_theta_0 = 1/ma2_theta_sum;
ma2_theta_1 = -ma1_1/ma2_theta_sum;
ma2_theta_2 = -ma1_2/ma2_theta_sum;
ma2_theta_sum_norm = ma2_theta_0 + ma2_theta_1 + ma2_theta_2;
keep fundid_mer ma2_STATUS_ ma2_theta_0 ma2_theta_1 ma2_theta_2 ma2_theta_sum;
run;

data arima_ma1_est;
set arima_ma1_est;
if _TYPE_ = "EST";
ma1_STATUS_ = _STATUS_;
ma1_theta_sum = 1 - ma1_1;
ma1_theta_0 = 1/ma1_theta_sum;
ma1_theta_1 = -ma1_1/ma1_theta_sum;
ma1_theta_sum_norm = ma1_theta_0 + ma1_theta_1;
keep fundid_mer ma1_STATUS_ ma1_theta_0 ma1_theta_1 ma1_theta_sum;
run;

*attach thetas to main dataset;
data hf01;
merge hf00 arima_ma1_est arima_ma2_est arima_ma3_est;
by fundid_mer;
run;

*ADD AIC Estimate of best fit model;
data hf00_diag_aic; 
set hf00_diag;
if _STAT_ = "AIC";
ma0_aic = ma0_value;
ma1_aic = ma1_value;
ma2_aic = ma2_value;
ma3_aic = ma3_value;
keep fundid_mer _STAT_ ma0_aic ma1_aic ma2_aic ma3_aic;
run;

proc sort data = hf00_diag_aic; by fundid_mer; run;
proc sort data = hf01; by fundid_mer; run;

data hf01; 
merge hf01 hf00_diag_aic; 
by fundid_mer; 
run;

**Add convergence status;
data hf00_diag_conv; 
set hf00_diag;
if _STAT_ = "CONV";
ma0_conv = ma0_value;
ma1_conv = ma1_value;
ma2_conv = ma2_value;
ma3_conv = ma3_value;
keep fundid_mer ma0_conv ma1_conv ma2_conv ma3_conv;
run;
proc sort data = hf00_diag_conv; by fundid_mer; run;
proc sort data = hf01; by fundid_mer; run;

data hf01; 
merge hf01 hf00_diag_conv; 
by fundid_mer; 
run;

**Add error variance;
data hf00_diag_errorvar; 
set hf00_diag;
if _STAT_ = "ERRORVAR";
ma0_errorvar = ma0_value;
ma1_errorvar = ma1_value;
ma2_errorvar = ma2_value;
ma3_errorvar = ma3_value;
keep fundid_mer ma0_errorvar ma1_errorvar ma2_errorvar ma3_errorvar;
run;
proc sort data = hf00_diag_errorvar; by fundid_mer; run;
proc sort data = hf01; by fundid_mer; run;

data hf01; 
merge hf01 hf00_diag_errorvar; 
by fundid_mer; 
run;


*Now use ARIMA statistics to determine best model;
*In case of non-convergence, set to MA(0) (i.e., do not unsmooth, but do not drop fund to maintain same sample across methods);

/* Pick the MA order (0-3) per fund by AIC, after disqualifying bad fits.
   An MA(q) fit is disqualified (AIC replaced by 99999) when
     - any of its normalized thetas is gt 1.25 or le -0.45
       (flag maQ_gt_1_5. The name is historical, the bound is 1.25), or
     - PROC ARIMA reported non-convergence (maQ_conv ge 1).
   A missing theta also satisfies le -0.45, because SAS orders missing below
   every number, so fits without estimates are disqualified as well.
   MA(0) is never penalized, so it is the fallback: aic_adj1_win = 0 and
   thetas (1,0,0,0), i.e. the fund is left unsmoothed but stays in the sample.
   On ties the higher order wins, because the later IF overrides.
   Fix: ma3_theta_3 was tested against 1.5 while every other theta uses
   1.25. It now uses 1.25 like the rest.
   NOTE(review): if the main HF code applies the same screen, keep the two
   in sync. */
data hf01;
set hf01;
ma1_gt_1_5 = 0;
if ma1_theta_0 gt 1.25 or ma1_theta_1 gt 1.25 then ma1_gt_1_5 = 1;
if ma1_theta_0 le -0.45 or ma1_theta_1 le -0.45 then ma1_gt_1_5 = 1;
ma2_gt_1_5 = 0;
if ma2_theta_0 gt 1.25 or ma2_theta_1 gt 1.25 or ma2_theta_2 gt 1.25 then ma2_gt_1_5 = 1;
if ma2_theta_0 le -0.45 or ma2_theta_1 le -0.45 or ma2_theta_2 le -0.45 then ma2_gt_1_5 = 1;
ma3_gt_1_5 = 0;
if ma3_theta_0 gt 1.25 or ma3_theta_1 gt 1.25 or ma3_theta_2 gt 1.25 or ma3_theta_3 gt 1.25 then ma3_gt_1_5 = 1;
if ma3_theta_0 le -0.45 or ma3_theta_1 le -0.45 or ma3_theta_2 le -0.45 or ma3_theta_3 le -0.45 then ma3_gt_1_5 = 1;
/* adjusted AICs: 99999 takes a disqualified model out of the comparison */
ma0_aic_adj1 = ma0_aic;
ma1_aic_adj1 = ma1_aic; if ma1_gt_1_5 = 1 then ma1_aic_adj1 = 99999;
ma2_aic_adj1 = ma2_aic; if ma2_gt_1_5 = 1 then ma2_aic_adj1 = 99999;
ma3_aic_adj1 = ma3_aic; if ma3_gt_1_5 = 1 then ma3_aic_adj1 = 99999;
if ma1_conv ge 1 then ma1_aic_adj1 = 99999;
if ma2_conv ge 1 then ma2_aic_adj1 = 99999;
if ma3_conv ge 1 then ma3_aic_adj1 = 99999;
/* winning order = smallest adjusted AIC */
aic_adj1_win = 0;
if ma1_aic_adj1 = min(ma0_aic_adj1,ma1_aic_adj1,ma2_aic_adj1,ma3_aic_adj1) then aic_adj1_win = 1;
if ma2_aic_adj1 = min(ma0_aic_adj1,ma1_aic_adj1,ma2_aic_adj1,ma3_aic_adj1) then aic_adj1_win = 2;
if ma3_aic_adj1 = min(ma0_aic_adj1,ma1_aic_adj1,ma2_aic_adj1,ma3_aic_adj1) then aic_adj1_win = 3;
/* thetas of the winning model. Lags beyond its order stay at 0 */
aic_theta_0 = 1; aic_theta_1 = 0; aic_theta_2 = 0; aic_theta_3 = 0;
if aic_adj1_win = 1 then aic_theta_0 = ma1_theta_0;
if aic_adj1_win = 1 then aic_theta_1 = ma1_theta_1;
if aic_adj1_win = 2 then aic_theta_0 = ma2_theta_0;
if aic_adj1_win = 2 then aic_theta_1 = ma2_theta_1;
if aic_adj1_win = 2 then aic_theta_2 = ma2_theta_2;
if aic_adj1_win = 3 then aic_theta_0 = ma3_theta_0;
if aic_adj1_win = 3 then aic_theta_1 = ma3_theta_1;
if aic_adj1_win = 3 then aic_theta_2 = ma3_theta_2;
if aic_adj1_win = 3 then aic_theta_3 = ma3_theta_3;
run;

data v_check_thetas; set hf01; run;
proc sort data = v_check_thetas nodupkey; by aut_rank categ fundid_mer; run;
proc summary data = v_check_thetas; 
var aic_adj1_win aic_theta_0 aic_theta_1 aic_theta_2 aic_theta_3;
output out = v_av_thetas
mean = aic_adj1_win aic_theta_0 aic_theta_1 aic_theta_2 aic_theta_3;
by aut_rank categ; 
run;



/********************************************************/
*back out estimated unsmoothed returns;

proc sort data = hf01; by fundid_mer yyyymm; run;



data hf01_loop;
set hf01;
backed_ret_aic = dem_ret;
keep backed_ret_aic
dem_ret fundid_mer yyyymm fund_seq av_ret_fundid
aic_theta_0 aic_theta_1 aic_theta_2 aic_theta_3;
run;

proc sort data = hf01_loop; by fundid_mer yyyymm; run;

/* AIC MA3 recursion, done in ONE pass over hf01_loop.
   hf01_loop arrives sorted by fundid_mer yyyymm with backed_ret_aic equal to
   dem_ret on every row. For the 4th and later observations of a fund
   (fund_seq ge 4) the unsmoothed return is
     (dem_ret - theta1*b[t-1] - theta2*b[t-2] - theta3*b[t-3]) / theta0
   where b[] are the already unsmoothed values of the three previous rows.
   The first three observations of each fund keep dem_ret.
   This replaces a macro loop that rewrote the whole dataset once per
   fund_seq value from 4 to 276. The arithmetic expression is the same, but
   the work is one pass instead of 273, and funds with more than 276
   observations are no longer left partly unsmoothed.
   The retained helpers are not reset at fund boundaries on purpose: they are
   only read when fund_seq ge 4, by which point they hold rows 1-3 of the
   current fund. The three lagN_backed_ret_aic columns the old loop left in
   hf01_loop are not created. The next step keeps only fundid_mer, yyyymm
   and temp_backed_ret_aic. */
data hf01_loop;
set hf01_loop;
retain _ub1 _ub2 _ub3;
if fund_seq ge 4 then backed_ret_aic = (dem_ret - aic_theta_1*_ub1 - aic_theta_2*_ub2 - aic_theta_3*_ub3)/aic_theta_0;
/* shift the window of previous unsmoothed values */
_ub3 = _ub2;
_ub2 = _ub1;
_ub1 = backed_ret_aic;
drop _ub1 _ub2 _ub3;
run;



*clean dataset and add back mean;
data hf01_loop;
set hf01_loop;
temp_backed_ret_aic = backed_ret_aic + av_ret_fundid;
keep fundid_mer yyyymm temp_backed_ret_aic;
run;

proc sort data = hf01_loop; by fundid_mer yyyymm; run;
proc sort data = hf01; by fundid_mer yyyymm; run;

data hf02;
merge hf01 hf01_loop; 
by fundid_mer yyyymm; 
run;

*Adjust mean;
proc sort data = hf02; by fundid_mer; run;
proc summary data = hf02; 
var ret temp_backed_ret_aic;
output out = v_hf02_ret_mean
mean = std = /autoname;
by fundid_mer; 
run;

data hf02;
merge hf02 v_hf02_ret_mean;
by fundid_mer; 
backed_ret_aic = temp_backed_ret_aic + ret_mean - temp_backed_ret_aic_mean;
drop _TYPE_ _FREQ_;
run;




*Aggregate;
proc sort data = hf02; by aut_rank categ yyyymm; run;
proc summary data = hf02; 
var ret backed_ret_aic;
output out = hf2_ewret mean = ret_ew backed_ret_aic_ew;
by aut_rank categ yyyymm; 
run;

proc sort data = hf2_ewret; by categ yyyymm; run;

data hf2_ewret;
set hf2_ewret;
funds_categ_mm = _FREQ_;
lag1_ret_ew = lag1(ret_ew);
if lag(categ) ne categ then lag1_ret_ew = . ;
lag2_ret_ew = lag2(ret_ew);
if lag2(categ) ne categ then lag2_ret_ew = . ;

lag1_backed_ret_aic_ew = lag1(backed_ret_aic_ew);
if lag(categ) ne categ then lag1_backed_ret_aic_ew = . ;
lag2_backed_ret_aic_ew = lag2(backed_ret_aic_ew);
if lag2(categ) ne categ then lag2_backed_ret_aic_ew = . ;

drop _TYPE_ _FREQ_;
run;

/****************************************************************************************************************/

**** 3-step method, First step: Obtain aggregate unsmoothed returns;

data s1ag00;
set hf2_ewret;
keep aut_rank categ yyyymm ret_ew funds_categ_mm;
run;

*demean: subtract the time-series average of each category from its EW return;
proc sort data = s1ag00;
    by categ;
run;
proc summary data = s1ag00;
    by categ;
    var ret_ew funds_categ_mm;
    output out = w_mean_ew_ret mean = av_aggrret_categ av_funds_categ_mm;
run;

data s1ag00;
    merge s1ag00 w_mean_ew_ret;
    by categ;
    drop _TYPE_ _FREQ_ funds_categ_mm av_funds_categ_mm;
    dem_catret_ew = ret_ew - av_aggrret_categ;
run;

*Check dataset: mean of the demeaned series per category;
proc summary data = s1ag00;
    by categ;
    var dem_catret_ew;
    output out = check_avg_dem_catret_ew mean = / autoname;
run;

*category sequence #: number the months 1, 2, 3, ... within each category;
proc sort data = s1ag00;
    by categ yyyymm;
run;

data s1ag00;
    set s1ag00;
    by categ;
    *restart the running counter at the first month of a category, then count every row;
    if first.categ then categ_seq = 0;
    categ_seq + 1;
run;

*Apply MA unsmoothing;
*MA with 3 lags;
/* Fits four candidate models to the demeaned EW category return, one BY group per category: */
/* MA(3), MA(2), MA(1) and MA(0), all without intercept (noint) and by maximum likelihood.    */
/* The ma= lists give the starting values of the MA parameters.                               */
/* Each ESTIMATE statement writes its own OUTEST= (estimates) and OUTSTAT= (fit statistics)   */
/* dataset, named ag_arima_maQ_est and ag_arima_maQ_diag for Q = 0..3.                        */
proc sort data = s1ag00; by categ yyyymm; run;
proc arima data= s1ag00 /*out = arima_ma2_est_out*/;
identify var = dem_catret_ew noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= ag_arima_ma3_est OUTSTAT=ag_arima_ma3_diag noprint;
estimate q= 2 noint ma = -0.2 -0.2 method=ml OUTEST= ag_arima_ma2_est OUTSTAT=ag_arima_ma2_diag noprint;
estimate q= 1 noint ma = -0.2 method=ml OUTEST= ag_arima_ma1_est OUTSTAT=ag_arima_ma1_diag noprint;
estimate q= 0 noint method=ml OUTEST= ag_arima_ma0_est OUTSTAT=ag_arima_ma0_diag noprint;
*forecast noprint;
by categ;
run;
quit;

*Compare AIC from the 4 possible models;

*Give the statistic value a model-specific name so the four models can sit side by side;
data ag_arima_ma0_diag; set ag_arima_ma0_diag; ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data ag_arima_ma1_diag; set ag_arima_ma1_diag; ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data ag_arima_ma2_diag; set ag_arima_ma2_diag; ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
data ag_arima_ma3_diag; set ag_arima_ma3_diag; ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;

*Each diagnostics dataset holds several rows per category, one per statistic;
*Merging by categ alone would pair those rows by position only, so a statistic missing from one;
*model would silently shift the others. Sort and merge on the full key (categ, _STAT_) instead;
proc sort data = ag_arima_ma0_diag; by categ _STAT_; run;
proc sort data = ag_arima_ma1_diag; by categ _STAT_; run;
proc sort data = ag_arima_ma2_diag; by categ _STAT_; run;
proc sort data = ag_arima_ma3_diag; by categ _STAT_; run;

*Keep only the two statistics used downstream: AIC and the convergence code;
data s1ag00_diag;
merge ag_arima_ma0_diag ag_arima_ma1_diag ag_arima_ma2_diag ag_arima_ma3_diag;
by categ _STAT_;
if _STAT_ = "AIC" or _STAT_ = "CONV";
run;


*Estimated PAIs for case with 1, 2 and 3 MA lags;
*With S = 1 minus the sum of the MA estimates: pai_0 = 1/S and pai_k = -(MA estimate k)/S;

*MA(3);
data ag_arima_ma3_est;
    set ag_arima_ma3_est (where = (_TYPE_ = "EST"));
    keep categ ma3_STATUS_ ma3_pai_0 ma3_pai_1 ma3_pai_2 ma3_pai_3 ma3_pai_sum;
    ma3_STATUS_ = _STATUS_;
    ma3_pai_sum = 1 - ma1_1 - ma1_2- ma1_3;
    ma3_pai_0 = 1/ma3_pai_sum;
    ma3_pai_1 = -ma1_1/ma3_pai_sum;
    ma3_pai_2 = -ma1_2/ma3_pai_sum;
    ma3_pai_3 = -ma1_3/ma3_pai_sum;
    *sum of the normalised weights, computed but not kept;
    ma3_pai_sum_norm = ma3_pai_0 + ma3_pai_1 + ma3_pai_2 + ma3_pai_3;
run;

*MA(2);
data ag_arima_ma2_est;
    set ag_arima_ma2_est (where = (_TYPE_ = "EST"));
    keep categ ma2_STATUS_ ma2_pai_0 ma2_pai_1 ma2_pai_2 ma2_pai_sum;
    ma2_STATUS_ = _STATUS_;
    ma2_pai_sum = 1 - ma1_1 - ma1_2;
    ma2_pai_0 = 1/ma2_pai_sum;
    ma2_pai_1 = -ma1_1/ma2_pai_sum;
    ma2_pai_2 = -ma1_2/ma2_pai_sum;
    *sum of the normalised weights, computed but not kept;
    ma2_pai_sum_norm = ma2_pai_0 + ma2_pai_1 + ma2_pai_2;
run;

*MA(1);
data ag_arima_ma1_est;
    set ag_arima_ma1_est (where = (_TYPE_ = "EST"));
    keep categ ma1_STATUS_ ma1_pai_0 ma1_pai_1 ma1_pai_sum;
    ma1_STATUS_ = _STATUS_;
    ma1_pai_sum = 1 - ma1_1;
    ma1_pai_0 = 1/ma1_pai_sum;
    ma1_pai_1 = -ma1_1/ma1_pai_sum;
    *sum of the normalised weights, computed but not kept;
    ma1_pai_sum_norm = ma1_pai_0 + ma1_pai_1;
run;

*Attach coefficients to main dataset: every category row receives the MA(1), MA(2) and MA(3) pais;
data s1ag01;
    merge s1ag00
          ag_arima_ma1_est
          ag_arima_ma2_est
          ag_arima_ma3_est;
    by categ;
run;


*AIC Estimate of best fit model;
*Take the AIC rows of the diagnostics and rename the values to maQ_ag_aic;
data s1ag00_diag_aic;
    set s1ag00_diag (where = (_STAT_ = "AIC"));
    keep categ _STAT_ ma0_ag_aic ma1_ag_aic ma2_ag_aic ma3_ag_aic;
    ma0_ag_aic = ma0_value;
    ma1_ag_aic = ma1_value;
    ma2_ag_aic = ma2_value;
    ma3_ag_aic = ma3_value;
run;

proc sort data = s1ag01;
    by categ;
run;
proc sort data = s1ag00_diag_aic;
    by categ;
run;

*Attach the AIC values to the category panel;
data s1ag01;
    merge s1ag01 s1ag00_diag_aic;
    by categ;
run;

*Convergence status;
*Take the CONV rows of the diagnostics and rename the values to maQ_ag_conv;
data s1ag00_diag_conv;
    set s1ag00_diag (where = (_STAT_ = "CONV"));
    keep categ ma0_ag_conv ma1_ag_conv ma2_ag_conv ma3_ag_conv;
    ma0_ag_conv = ma0_value;
    ma1_ag_conv = ma1_value;
    ma2_ag_conv = ma2_value;
    ma3_ag_conv = ma3_value;
run;

proc sort data = s1ag01;
    by categ;
run;
proc sort data = s1ag00_diag_conv;
    by categ;
run;

*Attach the convergence codes to the category panel;
data s1ag01;
    merge s1ag01 s1ag00_diag_conv;
    by categ;
run;

*Use ARIMA statistics to determine best model;
*In case of non-convergence, set to MA(0) (i.e., do not unsmooth, but do not drop fund to maintain same sample across methods);
/* Chooses the MA order per category (ag_aic_adj1_win = 0..3) and the matching weights aic_pai_0..3. */
/* Steps, in the order coded below:                                                                 */
/*  1. maQ_gt_1_5 flags an MA(Q) fit when any of its pais is > 1.25 or <= -0.45. The flag name says  */
/*     1_5 but the coded upper bound is 1.25. A missing pai also satisfies the LE test, because SAS  */
/*     orders missing values below every number.                                                    */
/*  2. maQ_ag_aic_adj1 is the AIC, replaced by 99999 when the fit is flagged or its CONV code is >= 1.*/
/*     The MA(0) AIC is never adjusted.                                                             */
/*  3. The winner is the model whose adjusted AIC equals the minimum. The three IFs run in order     */
/*     1, 2, 3 and each overwrites the previous result, so ties go to the highest order.             */
/*  4. The default weights (1, 0, 0, 0) mean no unsmoothing; they are overwritten by the pais of the */
/*     winning MA(1), MA(2) or MA(3).                                                               */
data s1ag01;
set s1ag01;
ma1_gt_1_5 = 0; 
if ma1_pai_0 gt 1.25 or ma1_pai_1 gt 1.25 then ma1_gt_1_5 = 1;
if ma1_pai_0 le -0.45 or ma1_pai_1 le -0.45 then ma1_gt_1_5 = 1;
ma2_gt_1_5 = 0; 
if ma2_pai_0 gt 1.25 or ma2_pai_1 gt 1.25 or ma2_pai_2 gt 1.25 then ma2_gt_1_5 = 1;
if ma2_pai_0 le -0.45 or ma2_pai_1 le -0.45 or ma2_pai_2 le -0.45 then ma2_gt_1_5 = 1;
ma3_gt_1_5 = 0; 
if ma3_pai_0 gt 1.25 or ma3_pai_1 gt 1.25 or ma3_pai_2 gt 1.25 or ma3_pai_3 gt 1.25 then ma3_gt_1_5 = 1;
if ma3_pai_0 le -0.45 or ma3_pai_1 le -0.45 or ma3_pai_2 le -0.45 or ma3_pai_3 le -0.45 then ma3_gt_1_5 = 1;
/* adjusted AIC: 99999 acts as a sentinel that takes a model out of the comparison */
ma0_ag_aic_adj1 = ma0_ag_aic;
ma1_ag_aic_adj1 = ma1_ag_aic; if ma1_gt_1_5 = 1 then ma1_ag_aic_adj1 = 99999;
ma2_ag_aic_adj1 = ma2_ag_aic; if ma2_gt_1_5 = 1 then ma2_ag_aic_adj1 = 99999;
ma3_ag_aic_adj1 = ma3_ag_aic; if ma3_gt_1_5 = 1 then ma3_ag_aic_adj1 = 99999;
if ma1_ag_conv ge 1 then ma1_ag_aic_adj1 = 99999;
if ma2_ag_conv ge 1 then ma2_ag_aic_adj1 = 99999;
if ma3_ag_conv ge 1 then ma3_ag_aic_adj1 = 99999;
/* winner: 0 unless MA(1), MA(2) or MA(3) attains the minimum adjusted AIC */
/* NOTE(review): MIN ignores missing values, so if ma0_ag_aic were missing while all three others */
/* carry the sentinel, a rejected MA(3) would be selected - confirm that cannot happen in the data */
ag_aic_adj1_win = 0;
if ma1_ag_aic_adj1 = min(ma0_ag_aic_adj1,ma1_ag_aic_adj1,ma2_ag_aic_adj1,ma3_ag_aic_adj1) then ag_aic_adj1_win = 1;
if ma2_ag_aic_adj1 = min(ma0_ag_aic_adj1,ma1_ag_aic_adj1,ma2_ag_aic_adj1,ma3_ag_aic_adj1) then ag_aic_adj1_win = 2;
if ma3_ag_aic_adj1 = min(ma0_ag_aic_adj1,ma1_ag_aic_adj1,ma2_ag_aic_adj1,ma3_ag_aic_adj1) then ag_aic_adj1_win = 3;
/* weights of the selected model; MA(0) keeps pai_0 = 1 and all lag weights at 0 */
aic_pai_0 = 1; aic_pai_1 = 0; aic_pai_2 = 0; aic_pai_3 = 0;
if ag_aic_adj1_win = 1 then aic_pai_0 = ma1_pai_0;
if ag_aic_adj1_win = 1 then aic_pai_1 = ma1_pai_1;
if ag_aic_adj1_win = 2 then aic_pai_0 = ma2_pai_0;
if ag_aic_adj1_win = 2 then aic_pai_1 = ma2_pai_1;
if ag_aic_adj1_win = 2 then aic_pai_2 = ma2_pai_2;
if ag_aic_adj1_win = 3 then aic_pai_0 = ma3_pai_0;
if ag_aic_adj1_win = 3 then aic_pai_1 = ma3_pai_1;
if ag_aic_adj1_win = 3 then aic_pai_2 = ma3_pai_2;
if ag_aic_adj1_win = 3 then aic_pai_3 = ma3_pai_3;
run;

***Aggregate MA coefficients (pais);
*Selected order and pais, reduced to one record per rank and category;
data s1_ag_pais;
    keep aut_rank categ ag_aic_adj1_win aic_pai_0 aic_pai_1 aic_pai_2 aic_pai_3;
    set s1ag01;
run;
proc sort data = s1_ag_pais nodupkey;
    by aut_rank categ;
run;


*back out unsmoothed returns for aggregate returns;
proc sort data = s1ag01;
    by categ yyyymm;
run;

*Working dataset for the recursion: the backed-out series starts as a copy of the demeaned return;
data s1ag01_loop;
    keep aut_rank categ categ_seq backed_catret_aic dem_catret_ew yyyymm av_aggrret_categ
         aic_pai_0 aic_pai_1 aic_pai_2 aic_pai_3;
    set s1ag01;
    backed_catret_aic = dem_catret_ew;
run;

proc sort data = s1ag01_loop;
    by categ yyyymm;
run;

*AIC: back out the unsmoothed category return recursively, in one pass over the data;
*This replaces a macro loop that re-read the whole dataset once per sequence number from 4 to 276;
*and that left rows with categ_seq above 276 untouched. Results are identical up to 276;
*The dataset is ordered by categ and yyyymm and categ_seq counts 1, 2, 3, ... within categ, so for;
*categ_seq of 4 or more the three preceding rows always belong to the same category;
data s1ag01_loop;
set s1ag01_loop;

*final backed-out values of the three preceding rows, carried from row to row;
retain _prev1_backed_aic _prev2_backed_aic _prev3_backed_aic;
drop _prev1_backed_aic _prev2_backed_aic _prev3_backed_aic;

*scratch columns kept for compatibility with the former loop output (they are not reset at category boundaries);
lag1_backed_catret_aic = _prev1_backed_aic;
lag2_backed_catret_aic = _prev2_backed_aic;
lag3_backed_catret_aic = _prev3_backed_aic;

*the first three months of a category keep the demeaned reported return;
if categ_seq ge 4 then backed_catret_aic = (dem_catret_ew - aic_pai_1*lag1_backed_catret_aic - aic_pai_2*lag2_backed_catret_aic - aic_pai_3*lag3_backed_catret_aic)/aic_pai_0;

*shift the history window for the next row;
_prev3_backed_aic = _prev2_backed_aic;
_prev2_backed_aic = _prev1_backed_aic;
_prev1_backed_aic = backed_catret_aic;
run;



*Average backed ret by category;
proc sort data = s1ag01_loop;
    by categ;
run;
proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop mean = / autoname;
run;

*Remove the category mean from the backed-out series;
*The centered value is built under a work name, the old column is dropped and the work name;
*is renamed to backed_catret_aic, so the column ends up last in the dataset;
data s1ag01_loop;
    merge s1ag01_loop check_av_s1ag01_loop;
    by categ;
    _centered_catret_aic = backed_catret_aic - backed_catret_aic_mean;
    drop backed_catret_aic backed_catret_aic_mean _FREQ_ _TYPE_;
    rename _centered_catret_aic = backed_catret_aic;
run;

*Check dataset: mean of the centered series per category;
proc sort data = s1ag01_loop;
    by categ;
run;
proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop_v2 mean = / autoname;
run;

*Clean dataset and add back mean;
*dem_backed_catret_aic keeps the centered series, backed_catret_aic gets the category average added back;
data s1ag01_loop;
    keep categ yyyymm backed_catret_aic dem_backed_catret_aic;
    set s1ag01_loop;
    dem_backed_catret_aic = backed_catret_aic;
    backed_catret_aic = backed_catret_aic + av_aggrret_categ;
run;

proc sort data = s1ag01;
    by categ yyyymm;
run;
proc sort data = s1ag01_loop;
    by categ yyyymm;
run;

*s1ag02 = category panel plus the two backed-out series;
data s1ag02;
    merge s1ag01 s1ag01_loop;
    by categ yyyymm;
run;


*****************************************;
*Continue to step 2;

*Per category: statistics of the reported and the backed-out EW return from PROC CORR;
proc sort data = s1ag02;
    by categ;
run;
proc corr data = s1ag02 noprint out = corr_catret_backed;
    by categ;
    var ret_ew backed_catret_aic;
run;

*Keep the MEAN and STD rows plus the row named ret_ew, and add the relative change;
*of the backed-out column versus the ret_ew column;
data corr_catret_backed;
    set corr_catret_backed;
    if _TYPE_ = "MEAN" or _TYPE_ = "STD" or _NAME_ = "ret_ew";
    est_type = _TYPE_;
    change_aic = backed_catret_aic/ret_ew - 1;
    drop _TYPE_;
run;

proc sort data = corr_catret_backed;
    by descending est_type descending change_aic;
run;


*SECOND STEP: GET FUND-LEVEL UNSMOOTHED EXCESS RETURNS (EXCESS WRT EW CATEGORY);
*Need to break up the process into 4 parts;


*fund-level data: restrict hf00 to the columns used in the second step;
data s2fund00;
    keep ret yyyymm fundid_mer categ fund_seq av_ret_fundid lag_assets_fill aut_rank;
    set hf00;
run;

*keep unsmoothed EW series: reported EW return (as catret_ew), backed-out EW return and selected order;
data w_au_ewcatert;
    keep categ yyyymm catret_ew backed_catret_aic ag_aic_adj1_win;
    set s1ag02;
    catret_ew = ret_ew;
run;

proc sort data = w_au_ewcatert;
    by categ yyyymm;
run;

*lags to be used as covariates: lags 1 to 3 of the backed-out EW return;
*A lag is blanked when the row it comes from belongs to another category;
data w_au_ewcatert;
    set w_au_ewcatert;
    drop _categ_l1 _categ_l2 _categ_l3;

    *category one, two and three rows back (work variables, dropped);
    _categ_l1 = lag1(categ);
    _categ_l2 = lag2(categ);
    _categ_l3 = lag3(categ);

    lag1_backed_catret_aic = lag1(backed_catret_aic);
    lag2_backed_catret_aic = lag2(backed_catret_aic);
    lag3_backed_catret_aic = lag3(backed_catret_aic);

    if categ ne _categ_l1 then lag1_backed_catret_aic = . ;
    if categ ne _categ_l2 then lag2_backed_catret_aic = . ;
    if categ ne _categ_l3 then lag3_backed_catret_aic = . ;
run;

*attach category-month returns and compute the fund return in excess of the category return;
proc sort data = w_au_ewcatert;
    by categ yyyymm;
run;
proc sort data = s2fund00;
    by categ yyyymm;
run;

data s2fund00;
    merge s2fund00 w_au_ewcatert;
    by categ yyyymm;
    ret_excat = ret - catret_ew;
run;

*Obtain funds average return in excess of the category return;
*also demean unsmoothed EW categ return (and its three lags) over the months of each fund;
proc sort data = s2fund00;
    by fundid_mer;
run;
proc summary data = s2fund00;
    by fundid_mer;
    var ret_excat
        backed_catret_aic
        lag1_backed_catret_aic
        lag2_backed_catret_aic
        lag3_backed_catret_aic;
    output out = w_mean_excat_ret
        mean = av_retexcat_fundid
               av_backed_catret_aic_fund
               av_lag1_backed_catret_aic_fund
               av_lag2_backed_catret_aic_fund
               av_lag3_backed_catret_aic_fund;
run;

*Subtract the fund-level averages;
data s2fund00;
    merge s2fund00 w_mean_excat_ret;
    by fundid_mer;
    dem_ret_excat              = ret_excat              - av_retexcat_fundid;
    dem_backed_catret_aic      = backed_catret_aic      - av_backed_catret_aic_fund;
    dem_lag1_backed_catret_aic = lag1_backed_catret_aic - av_lag1_backed_catret_aic_fund;
    dem_lag2_backed_catret_aic = lag2_backed_catret_aic - av_lag2_backed_catret_aic_fund;
    dem_lag3_backed_catret_aic = lag3_backed_catret_aic - av_lag3_backed_catret_aic_fund;
run;

*check average dem_ret_excat (and two of the demeaned covariates) per fund;
proc summary data = s2fund00;
    by fundid_mer;
    var dem_ret_excat dem_backed_catret_aic dem_lag1_backed_catret_aic;
    output out = check_avg_dem_ret_excat mean = / autoname;
run;



*MA with 3 lags;
*Control for aggregate lag returns as covariates based on the # of lags indicated by the previous step;
*Split into 4 parts;
*First part: funds with aggregate AIC K = 0;
/* Subset: fund rows whose category was assigned ag_aic_adj1_win = 0 in the first step.         */
/* Model: demeaned category-excess return with no covariates; MA(3), MA(2), MA(1) and MA(0) are */
/* fitted per fund, without intercept and by maximum likelihood.                               */
/* Output: excat_arima_maQ_est_cataic0 (OUTEST=) and excat_arima_maQ_diag_cataic0 (OUTSTAT=).   */
data v_s2fund00_cataic0;
set s2fund00;
if ag_aic_adj1_win = 0;
run;
proc sort data = v_s2fund00_cataic0; by ag_aic_adj1_win fundid_mer yyyymm; run;
proc arima data= v_s2fund00_cataic0;
identify var = dem_ret_excat noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est_cataic0 OUTSTAT= excat_arima_ma3_diag_cataic0 noprint;
estimate q= 2 noint ma = -0.2 -0.2 method=ml OUTEST= excat_arima_ma2_est_cataic0 OUTSTAT= excat_arima_ma2_diag_cataic0 noprint;
estimate q= 1 noint ma = -0.2 method=ml OUTEST= excat_arima_ma1_est_cataic0 OUTSTAT= excat_arima_ma1_diag_cataic0 noprint;
estimate q= 0 noint method=ml OUTEST= excat_arima_ma0_est_cataic0 OUTSTAT= excat_arima_ma0_diag_cataic0 noprint;
*forecast noprint;
by ag_aic_adj1_win fundid_mer;
run;
quit;

*Second part: funds with aggregate AIC K = 1;
/* Subset: fund rows whose category was assigned ag_aic_adj1_win = 1 in the first step.           */
/* Model: demeaned category-excess return with two input series, the demeaned backed-out category */
/* return and its first lag; MA(3) down to MA(0) are fitted per fund, no intercept, ML.           */
/* Output: excat_arima_maQ_est_cataic1 (OUTEST=) and excat_arima_maQ_diag_cataic1 (OUTSTAT=).     */
data v_s2fund00_cataic1;
set s2fund00;
if ag_aic_adj1_win = 1;
run;
proc sort data = v_s2fund00_cataic1; by ag_aic_adj1_win fundid_mer yyyymm; run;
proc arima data= v_s2fund00_cataic1;
identify var = dem_ret_excat crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic) noprint;
estimate q= 3 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic) noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est_cataic1 OUTSTAT= excat_arima_ma3_diag_cataic1 noprint;
estimate q= 2 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic) noint ma = -0.2 -0.2 method=ml OUTEST= excat_arima_ma2_est_cataic1 OUTSTAT= excat_arima_ma2_diag_cataic1 noprint;
estimate q= 1 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic) noint ma = -0.2 method=ml OUTEST= excat_arima_ma1_est_cataic1 OUTSTAT= excat_arima_ma1_diag_cataic1 noprint;
estimate q= 0 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic) noint method=ml OUTEST= excat_arima_ma0_est_cataic1 OUTSTAT= excat_arima_ma0_diag_cataic1 noprint;
*forecast noprint;
by ag_aic_adj1_win fundid_mer;
run;
quit;

*Third part: funds with aggregate AIC K = 2;
/* Subset: fund rows whose category was assigned ag_aic_adj1_win = 2 in the first step.             */
/* Model: demeaned category-excess return with three input series, the demeaned backed-out category */
/* return and its first two lags; MA(3) down to MA(0) are fitted per fund, no intercept, ML.        */
/* Output: excat_arima_maQ_est_cataic2 (OUTEST=) and excat_arima_maQ_diag_cataic2 (OUTSTAT=).       */
data v_s2fund00_cataic2;
set s2fund00;
if ag_aic_adj1_win = 2;
run;
proc sort data = v_s2fund00_cataic2; by ag_aic_adj1_win fundid_mer yyyymm; run;
proc arima data= v_s2fund00_cataic2;
identify var = dem_ret_excat crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic) noprint;
estimate q= 3 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic) noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est_cataic2 OUTSTAT= excat_arima_ma3_diag_cataic2 noprint;
estimate q= 2 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic) noint ma = -0.2 -0.2 method=ml OUTEST= excat_arima_ma2_est_cataic2 OUTSTAT= excat_arima_ma2_diag_cataic2 noprint;
estimate q= 1 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic) noint ma = -0.2 method=ml OUTEST= excat_arima_ma1_est_cataic2 OUTSTAT= excat_arima_ma1_diag_cataic2 noprint;
estimate q= 0 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic) noint method=ml OUTEST= excat_arima_ma0_est_cataic2 OUTSTAT= excat_arima_ma0_diag_cataic2 noprint;
*forecast noprint;
by ag_aic_adj1_win fundid_mer;
run;
quit;

*Fourth part: funds with aggregate AIC K = 3;
/* Subset: fund rows whose category was assigned ag_aic_adj1_win = 3 in the first step.            */
/* Model: demeaned category-excess return with four input series, the demeaned backed-out category */
/* return and its first three lags; MA(3) down to MA(0) are fitted per fund, no intercept, ML.     */
/* Output: excat_arima_maQ_est_cataic3 (OUTEST=) and excat_arima_maQ_diag_cataic3 (OUTSTAT=).      */
data v_s2fund00_cataic3;
set s2fund00;
if ag_aic_adj1_win = 3;
run;
proc sort data = v_s2fund00_cataic3; by ag_aic_adj1_win fundid_mer yyyymm; run;
proc arima data= v_s2fund00_cataic3;
identify var = dem_ret_excat crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noprint;
estimate q= 3 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est_cataic3 OUTSTAT= excat_arima_ma3_diag_cataic3 noprint;
estimate q= 2 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint ma = -0.2 -0.2 method=ml OUTEST= excat_arima_ma2_est_cataic3 OUTSTAT= excat_arima_ma2_diag_cataic3 noprint;
estimate q= 1 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint ma = -0.2 method=ml OUTEST= excat_arima_ma1_est_cataic3 OUTSTAT= excat_arima_ma1_diag_cataic3 noprint;
estimate q= 0 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint method=ml OUTEST= excat_arima_ma0_est_cataic3 OUTSTAT= excat_arima_ma0_diag_cataic3 noprint;
*forecast noprint;
by ag_aic_adj1_win fundid_mer;
run;
quit;

*Compare AIC and convergence status from the 4 possible models (from the 4 parts);
*For every diagnostics dataset: keep the AIC row, store its value as maQ_value, then sort by fund;

*Part K = 0;
data excat_arima_ma0_diag_cataic0; set excat_arima_ma0_diag_cataic0 (where = (_STAT_ = "AIC")); ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma0_diag_cataic0; by fundid_mer; run;
data excat_arima_ma1_diag_cataic0; set excat_arima_ma1_diag_cataic0 (where = (_STAT_ = "AIC")); ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma1_diag_cataic0; by fundid_mer; run;
data excat_arima_ma2_diag_cataic0; set excat_arima_ma2_diag_cataic0 (where = (_STAT_ = "AIC")); ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma2_diag_cataic0; by fundid_mer; run;
data excat_arima_ma3_diag_cataic0; set excat_arima_ma3_diag_cataic0 (where = (_STAT_ = "AIC")); ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma3_diag_cataic0; by fundid_mer; run;

*Part K = 1;
data excat_arima_ma0_diag_cataic1; set excat_arima_ma0_diag_cataic1 (where = (_STAT_ = "AIC")); ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma0_diag_cataic1; by fundid_mer; run;
data excat_arima_ma1_diag_cataic1; set excat_arima_ma1_diag_cataic1 (where = (_STAT_ = "AIC")); ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma1_diag_cataic1; by fundid_mer; run;
data excat_arima_ma2_diag_cataic1; set excat_arima_ma2_diag_cataic1 (where = (_STAT_ = "AIC")); ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma2_diag_cataic1; by fundid_mer; run;
data excat_arima_ma3_diag_cataic1; set excat_arima_ma3_diag_cataic1 (where = (_STAT_ = "AIC")); ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma3_diag_cataic1; by fundid_mer; run;

*Part K = 2;
data excat_arima_ma0_diag_cataic2; set excat_arima_ma0_diag_cataic2 (where = (_STAT_ = "AIC")); ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma0_diag_cataic2; by fundid_mer; run;
data excat_arima_ma1_diag_cataic2; set excat_arima_ma1_diag_cataic2 (where = (_STAT_ = "AIC")); ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma1_diag_cataic2; by fundid_mer; run;
data excat_arima_ma2_diag_cataic2; set excat_arima_ma2_diag_cataic2 (where = (_STAT_ = "AIC")); ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma2_diag_cataic2; by fundid_mer; run;
data excat_arima_ma3_diag_cataic2; set excat_arima_ma3_diag_cataic2 (where = (_STAT_ = "AIC")); ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma3_diag_cataic2; by fundid_mer; run;

*Part K = 3;
data excat_arima_ma0_diag_cataic3; set excat_arima_ma0_diag_cataic3 (where = (_STAT_ = "AIC")); ma0_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma0_diag_cataic3; by fundid_mer; run;
data excat_arima_ma1_diag_cataic3; set excat_arima_ma1_diag_cataic3 (where = (_STAT_ = "AIC")); ma1_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma1_diag_cataic3; by fundid_mer; run;
data excat_arima_ma2_diag_cataic3; set excat_arima_ma2_diag_cataic3 (where = (_STAT_ = "AIC")); ma2_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma2_diag_cataic3; by fundid_mer; run;
data excat_arima_ma3_diag_cataic3; set excat_arima_ma3_diag_cataic3 (where = (_STAT_ = "AIC")); ma3_value = _VALUE_; drop _TYPE_ _VALUE_; run;
proc sort data = excat_arima_ma3_diag_cataic3; by fundid_mer; run;

*merge together;
*List of funds with their category, one record per fund;
data s2fund00_diag;
    keep fundid_mer categ;
    set s2fund00;
run;
proc sort data = s2fund00_diag nodupkey;
    by fundid_mer;
run;


*(This step is different from doing this in the full sample since this is a rolling window);
*Put the MA(0) to MA(3) AIC values of each fund side by side, one dataset per aggregate order K;
*The inputs were sorted by fundid_mer just above, so the merge is keyed on it explicitly;
*Without a BY statement the merge is positional, and a fund absent from any one input would;
*shift every later row and attach AIC values to the wrong fund;
*diag_rol_yyyy_end stamps the rows with the end year of the rolling window;
data v_diag_cataic0_au;
merge excat_arima_ma0_diag_cataic0 excat_arima_ma1_diag_cataic0 excat_arima_ma2_diag_cataic0 excat_arima_ma3_diag_cataic0;
by fundid_mer;
diag_rol_yyyy_end = &yearto;
run;

data v_diag_cataic1_au;
merge excat_arima_ma0_diag_cataic1 excat_arima_ma1_diag_cataic1 excat_arima_ma2_diag_cataic1 excat_arima_ma3_diag_cataic1;
by fundid_mer;
diag_rol_yyyy_end = &yearto;
run;

data v_diag_cataic2_au;
merge excat_arima_ma0_diag_cataic2 excat_arima_ma1_diag_cataic2 excat_arima_ma2_diag_cataic2 excat_arima_ma3_diag_cataic2;
by fundid_mer;
diag_rol_yyyy_end = &yearto;
run;

data v_diag_cataic3_au;
merge excat_arima_ma0_diag_cataic3 excat_arima_ma1_diag_cataic3 excat_arima_ma2_diag_cataic3 excat_arima_ma3_diag_cataic3;
by fundid_mer;
diag_rol_yyyy_end = &yearto;
run;

*Stack the diagnostics of the four parts into one dataset and order it by fund;
data s2fund00_diag_aic;
    set v_diag_cataic0_au
        v_diag_cataic1_au
        v_diag_cataic2_au
        v_diag_cataic3_au;
run;

proc sort data = s2fund00_diag_aic;
    by fundid_mer;
run;

*Save temporary dataset by rolling window year;
data w_saves2fund00_diag_aic_&yearto;
    set s2fund00_diag_aic;
run;


*Estimated PHIs for case with 1, 2 and 3 MA lags;
*The estimates of the four parts are stacked per MA order;
*With S = 1 minus the sum of the MA estimates: phi_0 = 1/S and phi_k = -(MA estimate k)/S;

*MA3;
data excat_arima_ma3_est;
    set excat_arima_ma3_est_cataic3
        excat_arima_ma3_est_cataic2
        excat_arima_ma3_est_cataic1
        excat_arima_ma3_est_cataic0;
    if _TYPE_ = "EST";
    keep fundid_mer ma3_STATUS_ ma3_phi_0 ma3_phi_1 ma3_phi_2 ma3_phi_3 ma3_phi_sum;
    ma3_STATUS_ = _STATUS_;
    ma3_phi_sum = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_phi_0 = 1/ma3_phi_sum;
    ma3_phi_1 = -ma1_1/ma3_phi_sum;
    ma3_phi_2 = -ma1_2/ma3_phi_sum;
    ma3_phi_3 = -ma1_3/ma3_phi_sum;
    *sum of the normalised weights, computed but not kept;
    ma3_phi_sum_norm = ma3_phi_0 + ma3_phi_1 + ma3_phi_2 + ma3_phi_3;
run;

*MA2;
data excat_arima_ma2_est;
    set excat_arima_ma2_est_cataic3
        excat_arima_ma2_est_cataic2
        excat_arima_ma2_est_cataic1
        excat_arima_ma2_est_cataic0;
    if _TYPE_ = "EST";
    keep fundid_mer ma2_STATUS_ ma2_phi_0 ma2_phi_1 ma2_phi_2 ma2_phi_sum;
    ma2_STATUS_ = _STATUS_;
    ma2_phi_sum = 1 - ma1_1 - ma1_2;
    ma2_phi_0 = 1/ma2_phi_sum;
    ma2_phi_1 = -ma1_1/ma2_phi_sum;
    ma2_phi_2 = -ma1_2/ma2_phi_sum;
    *sum of the normalised weights, computed but not kept;
    ma2_phi_sum_norm = ma2_phi_0 + ma2_phi_1 + ma2_phi_2;
run;

*MA1;
data excat_arima_ma1_est;
    set excat_arima_ma1_est_cataic3
        excat_arima_ma1_est_cataic2
        excat_arima_ma1_est_cataic1
        excat_arima_ma1_est_cataic0;
    if _TYPE_ = "EST";
    keep fundid_mer ma1_STATUS_ ma1_phi_0 ma1_phi_1 ma1_phi_sum;
    ma1_STATUS_ = _STATUS_;
    ma1_phi_sum = 1 - ma1_1;
    ma1_phi_0 = 1/ma1_phi_sum;
    ma1_phi_1 = -ma1_1/ma1_phi_sum;
    *sum of the normalised weights, computed but not kept;
    ma1_phi_sum_norm = ma1_phi_0 + ma1_phi_1;
run;

*MA0: only the estimation status is kept;
data excat_arima_ma0_est;
    set excat_arima_ma0_est_cataic3
        excat_arima_ma0_est_cataic2
        excat_arima_ma0_est_cataic1
        excat_arima_ma0_est_cataic0;
    if _TYPE_ = "EST";
    keep fundid_mer ma0_STATUS_;
    ma0_STATUS_ = _STATUS_;
run;

*attach the fund-level estimates (status and phis of each MA order) to the main dataset;
proc sort data = excat_arima_ma0_est; by fundid_mer; run;
proc sort data = excat_arima_ma1_est; by fundid_mer; run;
proc sort data = excat_arima_ma2_est; by fundid_mer; run;
proc sort data = excat_arima_ma3_est; by fundid_mer; run;
proc sort data = s2fund00; by fundid_mer; run;

*est_rol_yyyy_end stamps the rows with the end year of the rolling window;
data s2fund01;
    merge s2fund00
          excat_arima_ma0_est
          excat_arima_ma1_est
          excat_arima_ma2_est
          excat_arima_ma3_est;
    by fundid_mer;
    est_rol_yyyy_end = &yearto;
run;

*ADD AIC Estimate of best fit model: attach the fund-level AIC values of the four MA orders;
proc sort data = s2fund01;
    by fundid_mer;
run;
proc sort data = s2fund00_diag_aic;
    by fundid_mer;
run;

data s2fund01;
    merge s2fund01 s2fund00_diag_aic;
    by fundid_mer;
    drop _TYPE_ _FREQ_;
run;


***Now use ARIMA statistics to determine best model;
/* Chooses the MA order per fund (res_aic_adj1_win = 0..3) and the matching weights aic_phi_0..3.   */
/* Steps, in the order coded below:                                                                */
/*  1. maQ_res_aic copies the AIC of each model.                                                   */
/*  2. maQ_res_conv is 0 only when the estimation status text equals "0 Converged", otherwise 1    */
/*     (this includes a missing status).                                                           */
/*  3. maQ_gt_1_5 flags an MA(Q) fit when any of its phis is > 1.25 or <= -0.45. The flag name says */
/*     1_5 but the coded upper bound is 1.25. A missing phi also satisfies the LE test, because SAS */
/*     orders missing values below every number.                                                   */
/*  4. maQ_res_aic_adj1 is the AIC, replaced by 99999 when the fit is flagged or did not converge.  */
/*     The MA(0) AIC is never adjusted.                                                            */
/*  5. The winner is the model whose adjusted AIC equals the minimum. The three IFs run in order    */
/*     1, 2, 3 and each overwrites the previous result, so ties go to the highest order.            */
/*  6. The default weights (1, 0, 0, 0) mean no unsmoothing; they are overwritten by the phis of    */
/*     the winning MA(1), MA(2) or MA(3).                                                          */
data s2fund01;
set s2fund01;
ma0_res_aic = ma0_value;
ma1_res_aic = ma1_value;
ma2_res_aic = ma2_value;
ma3_res_aic = ma3_value;
ma1_res_conv = 1; if ma1_STATUS_ = "0 Converged" then ma1_res_conv = 0;
ma2_res_conv = 1; if ma2_STATUS_ = "0 Converged" then ma2_res_conv = 0;
ma3_res_conv = 1; if ma3_STATUS_ = "0 Converged" then ma3_res_conv = 0;
ma1_gt_1_5 = 0; 
if ma1_phi_0 gt 1.25 or ma1_phi_1 gt 1.25 then ma1_gt_1_5 = 1;
if ma1_phi_0 le -0.45 or ma1_phi_1 le -0.45 then ma1_gt_1_5 = 1;
ma2_gt_1_5 = 0; 
if ma2_phi_0 gt 1.25 or ma2_phi_1 gt 1.25 or ma2_phi_2 gt 1.25 then ma2_gt_1_5 = 1;
if ma2_phi_0 le -0.45 or ma2_phi_1 le -0.45 or ma2_phi_2 le -0.45 then ma2_gt_1_5 = 1;
ma3_gt_1_5 = 0; 
if ma3_phi_0 gt 1.25 or ma3_phi_1 gt 1.25 or ma3_phi_2 gt 1.25 or ma3_phi_3 gt 1.25 then ma3_gt_1_5 = 1;
if ma3_phi_0 le -0.45 or ma3_phi_1 le -0.45 or ma3_phi_2 le -0.45 or ma3_phi_3 le -0.45 then ma3_gt_1_5 = 1;
/* adjusted AIC: 99999 acts as a sentinel that takes a model out of the comparison */
ma0_res_aic_adj1 = ma0_res_aic;
ma1_res_aic_adj1 = ma1_res_aic; if ma1_gt_1_5 = 1 then ma1_res_aic_adj1 = 99999;
ma2_res_aic_adj1 = ma2_res_aic; if ma2_gt_1_5 = 1 then ma2_res_aic_adj1 = 99999;
ma3_res_aic_adj1 = ma3_res_aic; if ma3_gt_1_5 = 1 then ma3_res_aic_adj1 = 99999;
if ma1_res_conv ge 1 then ma1_res_aic_adj1 = 99999;
if ma2_res_conv ge 1 then ma2_res_aic_adj1 = 99999;
if ma3_res_conv ge 1 then ma3_res_aic_adj1 = 99999;
/* winner: 0 unless MA(1), MA(2) or MA(3) attains the minimum adjusted AIC */
/* NOTE(review): MIN ignores missing values, so if ma0_res_aic were missing while all three others */
/* carry the sentinel, a rejected MA(3) would be selected - confirm that cannot happen in the data  */
res_aic_adj1_win = 0;
if ma1_res_aic_adj1 = min(ma0_res_aic_adj1,ma1_res_aic_adj1,ma2_res_aic_adj1,ma3_res_aic_adj1) then res_aic_adj1_win = 1;
if ma2_res_aic_adj1 = min(ma0_res_aic_adj1,ma1_res_aic_adj1,ma2_res_aic_adj1,ma3_res_aic_adj1) then res_aic_adj1_win = 2;
if ma3_res_aic_adj1 = min(ma0_res_aic_adj1,ma1_res_aic_adj1,ma2_res_aic_adj1,ma3_res_aic_adj1) then res_aic_adj1_win = 3;
/* weights of the selected model; MA(0) keeps phi_0 = 1 and all lag weights at 0 */
aic_phi_0 = 1; aic_phi_1 = 0; aic_phi_2 = 0; aic_phi_3 = 0;
if res_aic_adj1_win = 1 then aic_phi_0 = ma1_phi_0;
if res_aic_adj1_win = 1 then aic_phi_1 = ma1_phi_1;
if res_aic_adj1_win = 2 then aic_phi_0 = ma2_phi_0;
if res_aic_adj1_win = 2 then aic_phi_1 = ma2_phi_1;
if res_aic_adj1_win = 2 then aic_phi_2 = ma2_phi_2;
if res_aic_adj1_win = 3 then aic_phi_0 = ma3_phi_0;
if res_aic_adj1_win = 3 then aic_phi_1 = ma3_phi_1;
if res_aic_adj1_win = 3 then aic_phi_2 = ma3_phi_2;
if res_aic_adj1_win = 3 then aic_phi_3 = ma3_phi_3;
run;

*Save temporary dataset by rolling window year;
data w_saves2fund01_&yearto;
    set s2fund01;
run;

*Fund-level MA coefficients (phis): one record per rank, category and fund;
data w_s2_res_phis;
    keep aut_rank categ fundid_mer res_aic_adj1_win aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3;
    set s2fund01;
run;
proc sort data = w_s2_res_phis nodupkey;
    by aut_rank categ fundid_mer;
run;

*Average selected order and phis per rank and category;
proc summary data = w_s2_res_phis;
    by aut_rank categ;
    var res_aic_adj1_win aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3;
    output out = s2_res_phis
        mean = res_aic_adj1_win aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3;
run;

*check outliers min max;
proc summary data = w_s2_res_phis;
    by aut_rank categ;
    var aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3;
    output out = s2_res_phis_minmax min = max = / autoname;
run;

*check outliers percentiles;
proc summary data = w_s2_res_phis;
    by aut_rank categ;
    var aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3;
    output out = s2_res_phis_p1p99 p1 = p99 = / autoname;
run;

*Coefficient table for this rolling window: v_av_thetas, category pais and average fund phis;
data w_tab_ma_coeff_all_&yearto;
    merge v_av_thetas
          s1_ag_pais
          s2_res_phis;
    by aut_rank categ;
    drop _TYPE_ _FREQ_;
run;


*back out unsmoothed excess returns;
proc sort data = s2fund01;
    by fundid_mer yyyymm;
run;

*Working dataset for the recursion: the backed-out series starts as a copy of the demeaned excess return;
data s2fund01_loop;
    keep backed_ret_excat_aic dem_ret_excat fundid_mer yyyymm fund_seq av_ret_fundid
         aic_phi_0 aic_phi_1 aic_phi_2 aic_phi_3 av_retexcat_fundid;
    set s2fund01;
    backed_ret_excat_aic = dem_ret_excat;
run;

proc sort data = s2fund01_loop;
    by fundid_mer yyyymm;
run;

/* Recursive back-out of the unsmoothed category-excess return, one macro iteration per value of  */
/* fund_seq from 4 to 276. Each iteration re-reads the whole dataset, takes the first three lags  */
/* of the current backed-out series and recomputes only the rows whose fund_seq equals the loop   */
/* index; all other rows pass through unchanged.                                                  */
/* Rows with fund_seq 1 to 3 are never updated and keep the demeaned excess return.               */
/* Rows with fund_seq above 276 are never updated either.                                         */
/* The LAG calls run over the whole dataset with no reset at fund boundaries.                     */
/* NOTE(review): this is only safe if fund_seq counts 1, 2, 3, ... without gaps within fundid_mer */
/* in this sort order; fund_seq is built outside this section - confirm.                          */
%do i = 4 %to 276; *AIC;

data s2fund01_loop;
set s2fund01_loop;
lag1_backed_ret_excat_aic = lag1(backed_ret_excat_aic);
lag2_backed_ret_excat_aic = lag2(backed_ret_excat_aic);
lag3_backed_ret_excat_aic = lag3(backed_ret_excat_aic);

/* invert the MA filter: remove the weighted lagged unsmoothed values, then rescale by phi_0 */
if fund_seq = &i then backed_ret_excat_aic = (dem_ret_excat - aic_phi_1*lag1_backed_ret_excat_aic - aic_phi_2*lag2_backed_ret_excat_aic - aic_phi_3*lag3_backed_ret_excat_aic)/aic_phi_0;
run;

%end;



*clean dataset and keep adjusted residual: the backed-out excess return under the name res_backed_ret_excat_aic;
data s2fund01_loop;
    keep fundid_mer yyyymm res_backed_ret_excat_aic;
    set s2fund01_loop;
    res_backed_ret_excat_aic = backed_ret_excat_aic;
run;

proc sort data = s2fund01;
    by fundid_mer yyyymm;
run;
proc sort data = s2fund01_loop;
    by fundid_mer yyyymm;
run;

*s2fund02 = fund panel plus the backed-out excess return;
data s2fund02;
    merge s2fund01 s2fund01_loop;
    by fundid_mer yyyymm;
run;


* STEP 3 : Add up residual from aggregate MA and excess ret MA;

*Aggregate residual: category-month series from the first step;
data s1_resid;
    keep categ yyyymm av_aggrret_categ
         backed_catret_aic dem_backed_catret_aic;
    set s1ag02;
run;

*Fund-level cat-excess return residual from the second step;
data s2_resid;
    keep fundid_mer yyyymm dem_ret_excat res_backed_ret_excat_aic;
    set s2fund02;
run;

*attach to main dataset;
*First the category-level series, matched on category and month;
proc sort data = s1_resid;
    by categ yyyymm;
run;
proc sort data = hf02;
    by categ yyyymm;
run;

data s3_00;
    merge hf02 s1_resid;
    by categ yyyymm;
run;

*Then the fund-level series, matched on fund and month;
proc sort data = s2_resid;
    by fundid_mer yyyymm;
run;
proc sort data = s3_00;
    by fundid_mer yyyymm;
run;

data s3_00;
    merge s3_00 s2_resid;
    by fundid_mer yyyymm;
run;

*Calculate unsmoothed return: demeaned backed-out category return plus backed-out excess return plus fund average return;
data s3_00;
    set s3_00;
    s3_uns_ret_aic_temp = dem_backed_catret_aic + res_backed_ret_excat_aic + av_ret_fundid;
run;

*Fund-level means of the provisional series and of av_ret_fundid;
proc sort data = s3_00;
    by fundid_mer;
run;
proc summary data = s3_00;
    by fundid_mer;
    var s3_uns_ret_aic_temp av_ret_fundid;
    output out = w_adj_s3_uns mean = / autoname;
run;

*Re-center so the fund-level mean of the final series equals the mean of av_ret_fundid;
data s3_00;
    merge s3_00 w_adj_s3_uns;
    by fundid_mer;
    drop s3_uns_ret_aic_temp_mean av_ret_fundid_mean _TYPE_ _FREQ_;
    s3_uns_ret_aic = s3_uns_ret_aic_temp - s3_uns_ret_aic_temp_mean + av_ret_fundid_mean;
run;


*averages: category-level means of the reported, 1-step and 3-step series and of their components;
proc sort data = s3_00;
    by categ;
run;
proc summary data = s3_00;
    by categ;
    var ret av_ret_fundid backed_ret_aic s3_uns_ret_aic
        dem_backed_catret_aic res_backed_ret_excat_aic;
    output out = check_avret_s3_00_all mean = / autoname;
run;


/* Per-window output panel: raw return (ret), 1-step return (glm_ret) and
   3-step return (s3_ret) plus identifiers and aic_theta_0. */
data w_unsm_&yearto (keep = ret glm_ret s3_ret yyyymm assets_fill bh_fund fundid_mer
                            aut_rank categ lag_assets_fill fundid_mer_obs fund_seq aic_theta_0);
    set s3_00;
    glm_ret = backed_ret_aic;
    s3_ret  = s3_uns_ret_aic;
run;

/* Attach the estimated smoothing parameters. */

/* Fund-level parameter aic_phi_0 and category-level parameter aic_pai_0. */
data w_keep_phis (keep = fundid_mer aic_phi_0);
    set w_s2_res_phis;
run;
data w_keep_pais (keep = categ aic_pai_0);
    set s1_ag_pais;
run;

/* Join by fund. */
proc sort data = w_unsm_&yearto;
    by fundid_mer;
run;
proc sort data = w_keep_phis;
    by fundid_mer;
run;
data w_unsm_&yearto;
    merge w_unsm_&yearto
          w_keep_phis;
    by fundid_mer;
run;

/* Join by category. */
proc sort data = w_unsm_&yearto;
    by categ;
run;
proc sort data = w_keep_pais;
    by categ;
run;
data w_unsm_&yearto;
    merge w_unsm_&yearto
          w_keep_pais;
    by categ;
run;


/*******************************************************************/
/* Remove work datasets that each iteration may rewrite, so a stale copy
   from the previous iteration cannot be picked up by mistake. */

proc datasets lib=work nolist;
    delete
        /* diagnostics by selected category order */
        excat_arima_ma0_diag_cataic0 excat_arima_ma1_diag_cataic0 excat_arima_ma2_diag_cataic0 excat_arima_ma3_diag_cataic0
        excat_arima_ma0_diag_cataic1 excat_arima_ma1_diag_cataic1 excat_arima_ma2_diag_cataic1 excat_arima_ma3_diag_cataic1
        excat_arima_ma0_diag_cataic2 excat_arima_ma1_diag_cataic2 excat_arima_ma2_diag_cataic2 excat_arima_ma3_diag_cataic2
        excat_arima_ma0_diag_cataic3 excat_arima_ma1_diag_cataic3 excat_arima_ma2_diag_cataic3 excat_arima_ma3_diag_cataic3
        v_diag_cataic0_au v_diag_cataic1_au v_diag_cataic2_au v_diag_cataic3_au
        /* parameter estimates */
        excat_arima_ma0_est excat_arima_ma0_est_cataic0 excat_arima_ma0_est_cataic1 excat_arima_ma0_est_cataic2 excat_arima_ma0_est_cataic3
        excat_arima_ma1_est excat_arima_ma1_est_cataic0 excat_arima_ma1_est_cataic1 excat_arima_ma1_est_cataic2 excat_arima_ma1_est_cataic3
        excat_arima_ma2_est excat_arima_ma2_est_cataic0 excat_arima_ma2_est_cataic1 excat_arima_ma2_est_cataic2 excat_arima_ma2_est_cataic3
        excat_arima_ma3_est excat_arima_ma3_est_cataic0 excat_arima_ma3_est_cataic1 excat_arima_ma3_est_cataic2 excat_arima_ma3_est_cataic3
        /* fund-level diagnostics */
        S2fund00_diag S2fund00_diag_aic
    ;
quit;
run;


/********************************************************************************/
/* Run FH Regressions */

/* Attach the imputed add date (used further down to screen out backfilled
   funds). Only rows already present in the panel are kept. */
proc sort data = w_unsm_&yearto;
    by fundid_mer;
run;
proc sort data = imputed_add_dates;
    by fundid_mer;
run;

data w_unsm_&yearto;
    merge w_unsm_&yearto (in = in_panel)
          imputed_add_dates;
    by fundid_mer;
    if in_panel;
run;

/* Attach the factor data by month and form returns in excess of rf for
   the raw, 1-step and 3-step series. */
proc sort data = w_unsm_&yearto;
    by yyyymm;
run;
proc sort data = hffact;
    by yyyymm;
run;

data w_unsm_&yearto;
    merge w_unsm_&yearto (in = in_panel)
          hffact;
    by yyyymm;
    if in_panel;

    retrf     = ret     - rf;
    glm_retrf = glm_ret - rf;
    s3_retrf  = s3_ret  - rf;
run;

/* Regression sample for this window: January of the prior year through
   December of the window year (24 calendar months). Funds whose imputed
   add date falls after the window end are removed. */
data unsm_reg_select;
    set w_unsm_&yearto;

    rolreg_start_yyyymm = %eval((&yearto-1)*100 + 1);
    rolreg_end_yyyymm   = %eval(&yearto*100 + 12);

    if imputed_add_date le rolreg_end_yyyymm
       and rolreg_start_yyyymm le yyyymm
       and yyyymm le rolreg_end_yyyymm;
run;

proc sort data = unsm_reg_select;
    by fundid_mer yyyymm;
run;

/* Fund-by-fund factor regressions. Models are unlabeled, so PROC REG names
   them MODEL1 to MODEL6 in the order written here:
     1-3 : raw, 1-step, 3-step excess return on the eight factors
     4-6 : same dependent variables with one lag of each factor added
           (Dimson-style regressions)
   TABLEOUT adds the t-statistic rows to the OUTEST dataset. */
%local fh_now fh_lag1;
%let fh_now  = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
%let fh_lag1 = l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8;

proc reg data = unsm_reg_select outest = ww_unsm_reg_resu noprint tableout;
    by fundid_mer;
    model retrf     = &fh_now / edf ADJRSQ;
    model glm_retrf = &fh_now / edf ADJRSQ;
    model s3_retrf  = &fh_now / edf ADJRSQ;
    model retrf     = &fh_now &fh_lag1 / edf ADJRSQ;
    model glm_retrf = &fh_now &fh_lag1 / edf ADJRSQ;
    model s3_retrf  = &fh_now &fh_lag1 / edf ADJRSQ;
run; quit;

/* Pull the intercept (alpha) and its t-statistic out of the OUTEST dataset,
   one small dataset per model and statistic:
     MODEL1 -> r0   MODEL2 -> r1   MODEL3 -> r3
     MODEL4 -> r0d  MODEL5 -> r1d  MODEL6 -> r3d
   For MODEL3 the number of observations used (_P_ + _EDF_) is also kept
   as reg_obs. */
%local fh_k fh_tag;
%do fh_k = 1 %to 6;
    %let fh_tag = %scan(r0 r1 r3 r0d r1d r3d, &fh_k);

    /* alpha */
    data ww_&fh_tag._alp;
        set ww_unsm_reg_resu;
        if _TYPE_ = "PARMS";
        if _MODEL_ = "MODEL&fh_k";
        %if &fh_k = 3 %then %do;
            reg_obs = _P_ + _EDF_;
            keep reg_obs;
        %end;
        &fh_tag._alp = intercept;
        keep fundid_mer &fh_tag._alp;
    run;

    /* alpha t-stat */
    data ww_&fh_tag._t;
        set ww_unsm_reg_resu;
        if _TYPE_ = "T";
        if _MODEL_ = "MODEL&fh_k";
        &fh_tag._t = intercept;
        keep fundid_mer &fh_tag._t;
    run;
%end;


/* Attach every alpha and t-stat to the regression sample by fund ... */
data unsm_reg_select_alpha;
    merge unsm_reg_select
          ww_r0_alp  ww_r0_t
          ww_r1_alp  ww_r1_t
          ww_r3_alp  ww_r3_t
          ww_r0d_alp ww_r0d_t
          ww_r1d_alp ww_r1d_t
          ww_r3d_alp ww_r3d_t;
    by fundid_mer;
run;

/* ... then collapse to one row per fund (first row of each fund is kept). */
proc sort data = unsm_reg_select_alpha nodupkey;
    by fundid_mer;
run;

/* Keep only funds whose MODEL3 regression used 24 observations. */
data unsm_reg_select_alpha (keep = fundid_mer aut_rank categ aic_theta_0 aic_phi_0 aic_pai_0
                                   rolreg_end_yyyymm mdum reg_obs
                                   r0_alp r0_t r1_alp r1_t r3_alp r3_t
                                   r0d_alp r0d_t r1d_alp r1d_t r3d_alp r3d_t);
    set unsm_reg_select_alpha;
    if reg_obs = 24;
    mdum = 1;
run;

/* Constrained Dimson alphas.
   Sample: the monthly rows of the funds that passed the 24-observation
   screen above. */

data w_funs_reg_list (keep = fundid_mer reg_obs);
    set unsm_reg_select_alpha;
run;

proc sort data = unsm_reg_select;
    by fundid_mer;
run;
proc sort data = w_funs_reg_list;
    by fundid_mer;
run;

data unsm_reg_select_cdim;
    merge unsm_reg_select
          w_funs_reg_list;
    by fundid_mer;
    if reg_obs = 24;
run;

/* The nonlinear fit that follows is repeated from several omega starting
   values (0.01, 0.16, ... 0.91) so the selected estimate does not depend
   on the starting point. */
proc sort data = unsm_reg_select_cdim;
    by fundid_mer yyyymm;
run;

%do multin=1 %to 91 %by 15;
    %let multi = %sysevalf(0.01*&multin);

    /* For the current omega starting value (multi), fit the constrained
       Dimson model to each return series in turn:
         r0 = raw (retrf), r1 = 1-step (glm_retrf), r3 = 3-step (s3_retrf).
       The lag-1 and lag-2 factor loadings are restricted to omeg1 and omeg2
       times the contemporaneous loadings, with both omegas bounded to [0,1]. */
    %local cd_k cd_j cd_tag cd_dep;
    %do cd_k = 1 %to 3;
        %let cd_tag = %scan(r0 r1 r3, &cd_k);
        %let cd_dep = %scan(retrf glm_retrf s3_retrf, &cd_k);

        %ODSOff
        proc nlin data=unsm_reg_select_cdim outest = w_cont_dim_&cd_tag SMETHOD=GOLDEN save;
            ods output parameterestimates= w_cont_dim_&cd_tag._est;
            parameters b0=0 b1=1 b2=1 b3=1 b4=1 b5=1 b6=1 b7=1 b8=1 omeg1 = &multi omeg2 = &multi;
            bounds 0<= omeg1 <= 1, 0<= omeg2 <= 1;
            model &cd_dep = b0
                %do cd_j = 1 %to 8; +b&cd_j*fh&cd_j %end;
                %do cd_j = 1 %to 8; +omeg1*b&cd_j*l1_fh&cd_j %end;
                %do cd_j = 1 %to 8; +omeg2*b&cd_j*l2_fh&cd_j %end;
            ;
            by fundid_mer;
        run; quit;
        %ODSOn

        /* Final-iteration row of OUTEST: starting value, status, SSE, b0. */
        data w_cont_dim_&cd_tag._&multin (keep = fundid_mer val_&multin STATUS_&multin SSE_&multin
                                                 &cd_tag._b0_&multin);
            set w_cont_dim_&cd_tag;
            if _TYPE_ = "FINAL";
            val_&multin    = &multi;
            STATUS_&multin = _STATUS_;
            SSE_&multin    = _SSE_;
            &cd_tag._b0_&multin = b0;
        run;

        /* b0 estimate and t value from the ODS parameter table. */
        data w_cont_dim_&cd_tag._est_clean (keep = fundid_mer &cd_tag._b0_est_&multin &cd_tag._b0_t_&multin);
            set w_cont_dim_&cd_tag._est;
            if parameter = "b0";
            &cd_tag._b0_est_&multin = estimate;
            &cd_tag._b0_t_&multin   = tValue;
        run;

        /* Attach the t-stat to the final-iteration row. */
        proc sort data = w_cont_dim_&cd_tag._&multin;
            by fundid_mer;
        run;
        proc sort data = w_cont_dim_&cd_tag._est_clean;
            by fundid_mer;
        run;

        data w_cont_dim_&cd_tag._&multin;
            merge w_cont_dim_&cd_tag._&multin (in = in_final)
                  w_cont_dim_&cd_tag._est_clean;
            by fundid_mer;
            if in_final;
        run;
    %end;

%end;



/* For each return series (r0, r1, r3): line up the fits from all omega
   starting values side by side and keep the one with the smallest SSE.
   The first starting value is the default choice and any later starting
   value whose SSE equals the minimum replaces it, so on ties the last
   one wins. Status and t-stat of the chosen fit are saved as well. */
%local sse_k sse_tag;
%do sse_k = 1 %to 3;
    %let sse_tag = %scan(r0 r1 r3, &sse_k);

    proc sort data = w_funs_reg_list;
        by fundid_mer;
    run;

    data &sse_tag._cdim_coeff (keep = fundid_mer &sse_tag._best_status &sse_tag._best_b0 &sse_tag._best_b0_t);
        merge w_funs_reg_list
              %do multin=1 %to 91 %by 15; w_cont_dim_&sse_tag._&multin %end; ;
        by fundid_mer;

        /* smallest SSE across starting values */
        min_sse = min(sse_1 %do multin=16 %to 91 %by 15; ,SSE_&multin %end;);

        /* default: first starting value */
        &sse_tag._min_sse_omeg_start = val_1;
        &sse_tag._best_status        = STATUS_1;
        &sse_tag._best_b0            = &sse_tag._b0_est_1;
        &sse_tag._best_b0_t          = &sse_tag._b0_t_1;

        /* override with each later starting value that attains the minimum */
        %do multin=16 %to 91 %by 15;
            if min_sse = SSE_&multin then do;
                &sse_tag._min_sse_omeg_start = val_&multin;
                &sse_tag._best_status        = STATUS_&multin;
                &sse_tag._best_b0            = &sse_tag._b0_est_&multin;
                &sse_tag._best_b0_t          = &sse_tag._b0_t_&multin;
            end;
        %end;
    run;
%end;



/* Put the three constrained Dimson results on one row per fund and give
   them the alpha and t-stat names used downstream. */
data cdim_coeffs;
    merge r0_cdim_coeff
          r1_cdim_coeff
          r3_cdim_coeff;
    by fundid_mer;

    r0cd_alp = r0_best_b0;    r0cd_t = r0_best_b0_t;
    r1cd_alp = r1_best_b0;    r1cd_t = r1_best_b0_t;
    r3cd_alp = r3_best_b0;    r3cd_t = r3_best_b0_t;
run;

/* Add them to the OLS and Dimson alphas. */
proc sort data = unsm_reg_select_alpha;
    by fundid_mer;
run;
proc sort data = cdim_coeffs;
    by fundid_mer;
run;

data unsm_reg_select_alpha;
    merge unsm_reg_select_alpha
          cdim_coeffs;
    by fundid_mer;
run;

/* Store the result for this window, tagged with the window end year. */
data w_unsm_reg_select_alpha_&yearto;
    set unsm_reg_select_alpha;
    rol_yyyy_end = &yearto;
run;

%end;


/* Stack the per-window alpha datasets for every year from yystart to yyend. */

data all_unsm_alpha;
    set %do yearto = &yystart %to &yyend;
            w_unsm_reg_select_alpha_&yearto
        %end;
    ;
run;


%mend rollunsm;
%rollunsm;



********* Persist the stacked rolling-window alphas in the hfpersr library;

data hfpersr.rr2_all_unsm_alpha_2yrs_cdimv2;
    set all_unsm_alpha;
run;


******************* END OF STEP 1 (COMPUTATION OF ALPHAS IN ROLLING WINDOWS);

*******************************************************************************************************************************;
*******************************************************************************************************************************;
********** STEP 2: Compute Portfolio Returns and their Alphas;

*Reload the saved rolling-window alphas;

data all_unsm_alpha;
    set hfpersr.rr2_all_unsm_alpha_2yrs_cdimv2;
run;

*Build the dataset of rolling alphas for the chosen sample;
*Liquidity groups by aut_rank: 1-3 (low), 4-8 (mid), 9-10 (high);

%let sampsel = low_liq_2000;

data v_rol_alphas_select (keep = fundid_mer categ aut_rank
                                 r0_alp r0_t r1_alp r1_t r3_alp r3_t
                                 r0d_alp r0d_t r1d_alp r1d_t r3d_alp r3d_t
                                 r0cd_alp r0cd_t r1cd_alp r1cd_t r3cd_alp r3cd_t
                                 rol_yyyy_end forw_yyyy);
    set all_unsm_alpha;

    *Sample switch - exactly one of the three filters is active;
    *if aut_rank le 10; *all;
    if aut_rank le 3;   *low liquidity;
    *if aut_rank ge 9;  *high liquidity;

    *Alphas estimated through year t are used for year t+1, from 2000 onward;
    forw_yyyy = rol_yyyy_end + 1;
    if forw_yyyy ge 2000;
run;

*Quintile breakpoints (20th, 40th, 60th, 80th percentiles) of every signal;
*computed within each rolling-window end year on the selected sample;
*(note to self: within-category sorts were shown as robustness in revision step, not reported in final accepted version);
proc sort data = v_rol_alphas_select;
    by /*categ*/ rol_yyyy_end;
run;

proc means data = v_rol_alphas_select noprint;
    by /*categ*/ rol_yyyy_end;
    var r0_alp r0_t r1_alp r1_t r3_alp r3_t
        r0d_alp r0d_t r1d_alp r1d_t r3d_alp r3d_t
        r0cd_alp r0cd_t r1cd_alp r1cd_t r3cd_alp r3cd_t;
    output out = w_rol_alphas_quint
           p20 = p40 = p60 = p80 = / autoname;
run;

*Put the breakpoints next to each fund-year row;
data v_rol_alphas_select;
    merge v_rol_alphas_select
          w_rol_alphas_quint;
    by /*categ*/ rol_yyyy_end;
run;

*Assign each fund-year to a quintile (1 = lowest, 5 = highest) for every signal;
*Signals: alpha (_alp) and alpha t-stat (_t) from the raw (r0), 1-step (r1) and;
*3-step (r3) returns, for the OLS, Dimson (d) and constrained Dimson (cd) models;
*Rule: q=2 if signal gt p20, q=3 if gt p40, q=4 if gt p60, q=5 if ge p80, else q=1;
*FIX: SAS orders a missing value below every number, so a fund with a missing;
*signal used to fail every comparison and silently land in quintile 1. The;
*quintile is now left missing when the signal itself is missing;
*NOTE(review): downstream steps that group by these quintiles should be;
*checked to confirm that a missing-quintile group is ignored as intended;
data v_rol_alphas_select;
    set v_rol_alphas_select;

    *signals, in the order the quintile variables are created;
    array sig_v {18}
        r0_alp   r0_t   r1_alp   r1_t   r3_alp   r3_t
        r0d_alp  r0d_t  r1d_alp  r1d_t  r3d_alp  r3d_t
        r0cd_alp r0cd_t r1cd_alp r1cd_t r3cd_alp r3cd_t;

    *matching breakpoints (same element order as sig_v);
    array cut20 {18}
        r0_alp_p20   r0_t_p20   r1_alp_p20   r1_t_p20   r3_alp_p20   r3_t_p20
        r0d_alp_p20  r0d_t_p20  r1d_alp_p20  r1d_t_p20  r3d_alp_p20  r3d_t_p20
        r0cd_alp_p20 r0cd_t_p20 r1cd_alp_p20 r1cd_t_p20 r3cd_alp_p20 r3cd_t_p20;
    array cut40 {18}
        r0_alp_p40   r0_t_p40   r1_alp_p40   r1_t_p40   r3_alp_p40   r3_t_p40
        r0d_alp_p40  r0d_t_p40  r1d_alp_p40  r1d_t_p40  r3d_alp_p40  r3d_t_p40
        r0cd_alp_p40 r0cd_t_p40 r1cd_alp_p40 r1cd_t_p40 r3cd_alp_p40 r3cd_t_p40;
    array cut60 {18}
        r0_alp_p60   r0_t_p60   r1_alp_p60   r1_t_p60   r3_alp_p60   r3_t_p60
        r0d_alp_p60  r0d_t_p60  r1d_alp_p60  r1d_t_p60  r3d_alp_p60  r3d_t_p60
        r0cd_alp_p60 r0cd_t_p60 r1cd_alp_p60 r1cd_t_p60 r3cd_alp_p60 r3cd_t_p60;
    array cut80 {18}
        r0_alp_p80   r0_t_p80   r1_alp_p80   r1_t_p80   r3_alp_p80   r3_t_p80
        r0d_alp_p80  r0d_t_p80  r1d_alp_p80  r1d_t_p80  r3d_alp_p80  r3d_t_p80
        r0cd_alp_p80 r0cd_t_p80 r1cd_alp_p80 r1cd_t_p80 r3cd_alp_p80 r3cd_t_p80;

    *output quintile variables (same element order as sig_v);
    array qnt_v {18}
        r0_alp_q   r0_t_q   r1_alp_q   r1_t_q   r3_alp_q   r3_t_q
        r0d_alp_q  r0d_t_q  r1d_alp_q  r1d_t_q  r3d_alp_q  r3d_t_q
        r0cd_alp_q r0cd_t_q r1cd_alp_q r1cd_t_q r3cd_alp_q r3cd_t_q;

    do _sig_k = 1 to dim(sig_v);
        if missing(sig_v{_sig_k}) then qnt_v{_sig_k} = .;
        else do;
            qnt_v{_sig_k} = 1;
            if sig_v{_sig_k} gt cut20{_sig_k} then qnt_v{_sig_k} = 2;
            if sig_v{_sig_k} gt cut40{_sig_k} then qnt_v{_sig_k} = 3;
            if sig_v{_sig_k} gt cut60{_sig_k} then qnt_v{_sig_k} = 4;
            if sig_v{_sig_k} ge cut80{_sig_k} then qnt_v{_sig_k} = 5;
        end;
    end;

    *breakpoints and summary bookkeeping are no longer needed;
    drop _sig_k _TYPE_ _FREQ_
        r0_alp_p20 r0_alp_p40 r0_alp_p60 r0_alp_p80
        r0_t_p20 r0_t_p40 r0_t_p60 r0_t_p80
        r1_alp_p20 r1_alp_p40 r1_alp_p60 r1_alp_p80
        r1_t_p20 r1_t_p40 r1_t_p60 r1_t_p80
        r3_alp_p20 r3_alp_p40 r3_alp_p60 r3_alp_p80
        r3_t_p20 r3_t_p40 r3_t_p60 r3_t_p80
        r0d_alp_p20 r0d_alp_p40 r0d_alp_p60 r0d_alp_p80
        r0d_t_p20 r0d_t_p40 r0d_t_p60 r0d_t_p80
        r1d_alp_p20 r1d_alp_p40 r1d_alp_p60 r1d_alp_p80
        r1d_t_p20 r1d_t_p40 r1d_t_p60 r1d_t_p80
        r3d_alp_p20 r3d_alp_p40 r3d_alp_p60 r3d_alp_p80
        r3d_t_p20 r3d_t_p40 r3d_t_p60 r3d_t_p80
        r0cd_alp_p20 r0cd_alp_p40 r0cd_alp_p60 r0cd_alp_p80
        r0cd_t_p20 r0cd_t_p40 r0cd_t_p60 r0cd_t_p80
        r1cd_alp_p20 r1cd_alp_p40 r1cd_alp_p60 r1cd_alp_p80
        r1cd_t_p20 r1cd_t_p40 r1cd_t_p60 r1cd_t_p80
        r3cd_alp_p20 r3cd_alp_p40 r3cd_alp_p60 r3cd_alp_p80
        r3cd_t_p20 r3cd_t_p40 r3cd_t_p60 r3cd_t_p80;
run;

***********************************************;
*Factor data: map the eight source columns to fh1-fh8 and keep rf;
data hffact (keep = yyyymm fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 rf);
    set hfautoc.hf_factors;
    fh1 = sp500_rf;
    fh2 = size_spread;
    fh3 = emerg_mkt_rf;
    fh4 = FS_bond_mkt;
    fh5 = FS_credit_sprd;
    fh6 = PTFSBD;
    fh7 = PTFSFX;
    fh8 = PTFSCOM;
run;

*Lagged factors (1, 2 and 3 rows back in yyyymm order);
*Each LAG call is written out explicitly because every call keeps its own queue;

proc sort data = hffact;
    by yyyymm;
run;

data hffact;
    set hffact;

    *1 lag;
    l1_fh1 = lag1(fh1);  l1_fh2 = lag1(fh2);  l1_fh3 = lag1(fh3);  l1_fh4 = lag1(fh4);
    l1_fh5 = lag1(fh5);  l1_fh6 = lag1(fh6);  l1_fh7 = lag1(fh7);  l1_fh8 = lag1(fh8);

    *2 lags;
    l2_fh1 = lag2(fh1);  l2_fh2 = lag2(fh2);  l2_fh3 = lag2(fh3);  l2_fh4 = lag2(fh4);
    l2_fh5 = lag2(fh5);  l2_fh6 = lag2(fh6);  l2_fh7 = lag2(fh7);  l2_fh8 = lag2(fh8);

    *3 lags;
    l3_fh1 = lag3(fh1);  l3_fh2 = lag3(fh2);  l3_fh3 = lag3(fh3);  l3_fh4 = lag3(fh4);
    l3_fh5 = lag3(fh5);  l3_fh6 = lag3(fh6);  l3_fh7 = lag3(fh7);  l3_fh8 = lag3(fh8);
run;


*Forward returns for the portfolio analysis;
*Begin with all hedge fund returns, excluding funds of funds;

data hf00_ret (keep = fundid_mer ret yyyymm assets_fill fund_type stra bh_fund);
    set hfautoc.hf_merge00_july2020_min5;
    if fund_type = 'Fund of Funds' then delete;
run;

***Attach fund classifications from the manual strategy mapping;
proc import out = work.Strat_manual_1
datafile = "C:\Users\&pcname.\Dropbox\Research\Hedge Funds\Unsmoothing Returns\Hedge Fund Analysis\Summer_Revision\Strat_manual_1.xlsx"
dbms =xlsx replace; getnames = yes; run;

proc sort data = strat_manual_1;
    by fund_type stra;
run;
proc sort data = hf00_ret;
    by fund_type stra;
run;

data hf00_ret (drop = jkt_category);
    merge hf00_ret
          strat_manual_1;
    by fund_type stra;
    categ = jkt_category;
run;

*Drop the Other and FOF categories, rank the remaining categories 1-10;
*and restrict the sample to 199501-201712;
*Any category not listed below keeps the default rank of 1;
*NOTE(review): Global Macro is matched with a space while every other label;
*uses underscores - confirm against the spelling used in Strat_manual_1.xlsx;
data hf00_ret;
    set hf00_ret;

    if categ in ('Other', 'FOF') then delete;

    select (categ);
        when ('Event_driven')     aut_rank = 2;
        when ('Multi_strategy')   aut_rank = 3;
        when ('Emerging_Markets') aut_rank = 4;
        when ('Sector')           aut_rank = 5;
        when ('Long_Only')        aut_rank = 6;
        when ('Long_Short')       aut_rank = 7;
        when ('Market_Neutral')   aut_rank = 8;
        when ('Global Macro')     aut_rank = 9;
        when ('CTA')              aut_rank = 10;
        otherwise                 aut_rank = 1;
    end;

    if yyyymm ge 199501 and yyyymm le 201712;
run;

*Forward-return panel: split yyyymm into year and month;
*forw_yyyy is the calendar year of the return and is the key used to match;
*the alphas estimated through the previous year;
*(rounding yyyymm/100 gives the year because the month part is at most 12);
data v_forwret00 (keep = ret yyyymm fundid_mer yyyy forw_yyyy mm);
    set hf00_ret;
    yyyy      = round(yyyymm/100,1);
    forw_yyyy = yyyy;
    mm        = yyyymm - yyyy*100;
run;

*Previous-row return within each fund (missing on the first row of a fund);
proc sort data = v_forwret00;
    by fundid_mer yyyymm;
run;

data v_forwret00;
    set v_forwret00;
    by fundid_mer;
    lag_ret = lag(ret);
    if first.fundid_mer then lag_ret = . ;
run;

*Match each fund-year signal row with that fund's monthly returns in the;
*forward year (up to 12 monthly rows per fund-year). Only matches are kept;
proc sort data = v_rol_alphas_select;
    by fundid_mer forw_yyyy;
run;
proc sort data = v_forwret00;
    by fundid_mer forw_yyyy;
run;

*Portfolio weights start at 1 for every row and are updated month by month;
*further below, so January keeps weight 1;
data forwret00;
    merge v_rol_alphas_select (in = has_signal)
          v_forwret00         (in = has_return);
    by fundid_mer forw_yyyy;
    if has_signal and has_return;
    fund_w = 1;
run;

proc sort data = forwret00;
    by fundid_mer yyyymm;
run;

*Update weights: buy-and-hold within each forward year;
*For month i = 2..12 the weight is the previous row weight grown by the;
*previous return: fund_w = lag_fund_w*(1+lag_ret). One pass per month is;
*needed because each pass reads the weights written by the pass before it;
*FIX: the previous row is only used when it belongs to the same fund and the;
*same forward year. Before, a fund-year whose first retained row was not;
*January compounded the weight of whatever row preceded it (another fund or;
*another year). Such rows now keep the starting weight of 1;
%macro fund_wei;
    %local i;

    %do i = 2 %to 12;
        data forwret00;
            set forwret00;

            *LAG must run on every row, so all lags are taken unconditionally;
            lag_fund_w  = lag(fund_w);
            _prev_fund  = lag(fundid_mer);
            _prev_fyear = lag(forw_yyyy);

            if mm = &i
               and fundid_mer = _prev_fund
               and forw_yyyy  = _prev_fyear
            then fund_w = lag_fund_w*(1+lag_ret);

            drop _prev_fund _prev_fyear;
        run;
    %end;

%mend fund_wei;
%fund_wei;


*Manual check of the weights on the first 100 rows;
data v_check_forwret_weights;
    set forwret00;
    if _n_ le 100;
run;

*Monthly forward return series for each quintile portfolio;
*For a given t-stat signal: weighted mean of ret (weights fund_w) by quintile;
*and month, then a wide version with one column per quintile;
*  sig = signal name (its quintile variable is sig_q);
*  tag = short name used in the output dataset names and column prefix;
%macro portf_by_signal(sig=, tag=);

    proc sort data = forwret00;
        by &sig._q yyyymm;
    run;

    proc summary data = forwret00;
        by &sig._q yyyymm;
        var ret;
        weight fund_w;
        output out = v_portf_&tag._yyyymm
               mean = portret;
    run;

    *transpose;
    proc sort data = v_portf_&tag._yyyymm;
        by yyyymm;
    run;

    proc transpose data = v_portf_&tag._yyyymm out = v_portf_&tag._yyyymm_tr prefix = pret_&tag._q;
        by yyyymm;
        var portret;
        id &sig._q;
    run;

%mend portf_by_signal;

*OLS t-stats;
%portf_by_signal(sig=r0_t, tag=r0t);
%portf_by_signal(sig=r1_t, tag=r1t);
%portf_by_signal(sig=r3_t, tag=r3t);

*DIMSON;
%portf_by_signal(sig=r0d_t, tag=r0dt);
%portf_by_signal(sig=r1d_t, tag=r1dt);
%portf_by_signal(sig=r3d_t, tag=r3dt);

*Constrained DIMSON;
%portf_by_signal(sig=r0cd_t, tag=r0cdt);
%portf_by_signal(sig=r1cd_t, tag=r1cdt);
%portf_by_signal(sig=r3cd_t, tag=r3cdt);

************************************************;
*Stack the portfolio return series that will be unsmoothed;
*(since these are portfolios returns, use GLM);
*Each stacked row carries exactly one non-missing quintile variable, which;
*determines the portfolio label. All five quintiles are labelled for r0_t,;
*only quintiles 1 and 5 for the other signals. Unlabelled rows are deleted;
data portf_ret_series (drop = _TYPE_ _FREQ_);
    set v_portf_r0t_yyyymm   v_portf_r1t_yyyymm   v_portf_r3t_yyyymm
        v_portf_r0dt_yyyymm  v_portf_r1dt_yyyymm  v_portf_r3dt_yyyymm
        v_portf_r0cdt_yyyymm v_portf_r1cdt_yyyymm v_portf_r3cdt_yyyymm;

    *placeholder fixes the length of portf and marks unlabelled rows;
    portf = "makeinitialsizeemptylong";

    if r0_t_q in (1, 2, 3, 4, 5) then portf = cats("r0_t_q", r0_t_q);
    if r1_t_q in (1, 5)          then portf = cats("r1_t_q", r1_t_q);
    if r3_t_q in (1, 5)          then portf = cats("r3_t_q", r3_t_q);

    *Dimson;
    if r0d_t_q in (1, 5) then portf = cats("r0d_t_q", r0d_t_q);
    if r1d_t_q in (1, 5) then portf = cats("r1d_t_q", r1d_t_q);
    if r3d_t_q in (1, 5) then portf = cats("r3d_t_q", r3d_t_q);

    *Constrained Dimson;
    if r0cd_t_q in (1, 5) then portf = cats("r0cd_t_q", r0cd_t_q);
    if r1cd_t_q in (1, 5) then portf = cats("r1cd_t_q", r1cd_t_q);
    if r3cd_t_q in (1, 5) then portf = cats("r3cd_t_q", r3cd_t_q);

    if portf = "makeinitialsizeemptylong" then delete;
run;

*Running month counter within each portfolio (1, 2, 3, ...);
proc sort data = portf_ret_series;
    by portf yyyymm;
run;

data portf_ret_series;
    set portf_ret_series;
    by portf;
    if first.portf then portf_seq = 0;
    portf_seq + 1;
run;

*Demean each portfolio return series by its own time-series average;
proc sort data = portf_ret_series;
    by portf;
run;

proc means data = portf_ret_series noprint;
    by portf;
    var portret;
    output out = w_avg_portret
           mean = avg_portret;
run;

data portf_ret_series (drop = _TYPE_ _FREQ_);
    merge portf_ret_series
          w_avg_portret;
    by portf;
    dem_portret = portret - avg_portret;
run;


***Now run time series analysis of portfolio returns;
*For every portfolio fit MA(3), MA(2), MA(1) and MA(0) without intercept to the demeaned return (ML);
*Each ESTIMATE statement writes its own parameter file (OUTEST) and fit-statistics file (OUTSTAT);
proc sort data = portf_ret_series; by portf yyyymm; run;
proc arima data= portf_ret_series /*out = arima_ma2_est_out*/;
identify var = dem_portret noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= port_ma3_est OUTSTAT= port_ma3_diag noprint;
estimate q= 2 noint ma = -0.2 -0.2 method=ml OUTEST= port_ma2_est OUTSTAT= port_ma2_diag noprint;
estimate q= 1 noint ma = -0.2 method=ml OUTEST= port_ma1_est OUTSTAT= port_ma1_diag noprint;
estimate q= 0 noint method=ml OUTEST= port_ma0_est OUTSTAT= port_ma0_diag noprint;
*forecast noprint;
by portf;
run;
quit;

*keep only the AIC row of every fit-statistics file and rename the value by MA order;
data port_ma0_diag; set port_ma0_diag; ma0_aic = _VALUE_; if _STAT_ = "AIC"; drop _TYPE_ _VALUE_; run;
data port_ma1_diag; set port_ma1_diag; ma1_aic = _VALUE_; if _STAT_ = "AIC"; drop _TYPE_ _VALUE_; run;
data port_ma2_diag; set port_ma2_diag; ma2_aic = _VALUE_; if _STAT_ = "AIC"; drop _TYPE_ _VALUE_; run;
data port_ma3_diag; set port_ma3_diag; ma3_aic = _VALUE_; if _STAT_ = "AIC"; drop _TYPE_ _VALUE_; run;

*one row per portfolio, used as the frame for the AIC comparison;
data portf_ret_series_diag; set portf_ret_series; keep portf; run;
proc sort data = portf_ret_series_diag nodupkey; by portf; run;

*put the four AIC values side by side (one row per portfolio);
data portf_ret_series_diag;
merge portf_ret_series_diag port_ma0_diag port_ma1_diag port_ma2_diag port_ma3_diag;
by portf;
if _STAT_ = "AIC";
run;

*flag the order(s) whose AIC equals the minimum - min() ignores missing values and an exact tie sets several flags;
data portf_ret_series_diag; *step to identify minimum;
set portf_ret_series_diag;
ma0_win = 0; ma1_win = 0; ma2_win = 0; ma3_win = 0;
if ma0_aic = min(ma0_aic,ma1_aic,ma2_aic,ma3_aic) then ma0_win = 1;
if ma1_aic = min(ma0_aic,ma1_aic,ma2_aic,ma3_aic) then ma1_win = 1;
if ma2_aic = min(ma0_aic,ma1_aic,ma2_aic,ma3_aic) then ma2_win = 1;
if ma3_aic = min(ma0_aic,ma1_aic,ma2_aic,ma3_aic) then ma3_win = 1;
run;


*Estimated thetas;
*For each MA order the weights (1, -ma1_1, -ma1_2, ...) are divided by their sum;
*so that the resulting thetas add up to one. Only the EST row of the OUTEST file is used;

data port_ma3_est (keep = portf ma3_STATUS_ ma3_theta_0 ma3_theta_1 ma3_theta_2 ma3_theta_3 ma3_theta_sum);
set port_ma3_est;
where _TYPE_ = 'EST';
ma3_STATUS_ = _STATUS_;
ma3_theta_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_theta_0 = 1/ma3_theta_sum;
ma3_theta_1 = -ma1_1/ma3_theta_sum;
ma3_theta_2 = -ma1_2/ma3_theta_sum;
ma3_theta_3 = -ma1_3/ma3_theta_sum;
run;

data port_ma2_est (keep = portf ma2_STATUS_ ma2_theta_0 ma2_theta_1 ma2_theta_2 ma2_theta_sum);
set port_ma2_est;
where _TYPE_ = 'EST';
ma2_STATUS_ = _STATUS_;
ma2_theta_sum = 1 - ma1_1 - ma1_2;
ma2_theta_0 = 1/ma2_theta_sum;
ma2_theta_1 = -ma1_1/ma2_theta_sum;
ma2_theta_2 = -ma1_2/ma2_theta_sum;
run;

data port_ma1_est (keep = portf ma1_STATUS_ ma1_theta_0 ma1_theta_1 ma1_theta_sum);
set port_ma1_est;
where _TYPE_ = 'EST';
ma1_STATUS_ = _STATUS_;
ma1_theta_sum = 1 - ma1_1;
ma1_theta_0 = 1/ma1_theta_sum;
ma1_theta_1 = -ma1_1/ma1_theta_sum;
run;

*attach thetas to main dataset;
proc sort data = portf_ret_series; by portf yyyymm; run;

data portf_ret_series01;
merge portf_ret_series port_ma1_est port_ma2_est port_ma3_est;
by portf;
run;

*Attach AIC score and select the winning MA parameters (i.e., lowest AIC);
proc sort data = portf_ret_series_diag; by portf; run;
proc sort data = portf_ret_series01; by portf; run;

data portf_ret_series01;
merge portf_ret_series01 portf_ret_series_diag;
by portf;

*winning order: 0 unless a higher order is flagged - on a tie the highest flagged order is used;
aic_ma_win = 0;
if ma1_win = 1 then aic_ma_win = 1;
if ma2_win = 1 then aic_ma_win = 2;
if ma3_win = 1 then aic_ma_win = 3;

*default (order 0): the reported return is left as it is;
aic_theta_0 = 1; aic_theta_1 = 0; aic_theta_2 = 0; aic_theta_3 = 0;

select (aic_ma_win);
when (1) do;
aic_theta_0 = ma1_theta_0;
aic_theta_1 = ma1_theta_1;
end;
when (2) do;
aic_theta_0 = ma2_theta_0;
aic_theta_1 = ma2_theta_1;
aic_theta_2 = ma2_theta_2;
end;
when (3) do;
aic_theta_0 = ma3_theta_0;
aic_theta_1 = ma3_theta_1;
aic_theta_2 = ma3_theta_2;
aic_theta_3 = ma3_theta_3;
end;
otherwise;
end;
run;


*Unsmooth the portfolio returns;
*Do AIC-based with K up to 3;

proc sort data = portf_ret_series01; by portf yyyymm; run;

%macro back_out_portrets;
/* Builds portf_ret_series01_loop with backed_ret_aic, the unsmoothed (still demeaned) return.      */
/* Reads : portf_ret_series01 (dem_portret, portf_seq, aic_theta_0 to aic_theta_3, avg_portret).    */
/* The first three months of each portfolio keep the demeaned reported return; from portf_seq 4 on  */
/*   backed(t) = (dem(t) - theta1*backed(t-1) - theta2*backed(t-2) - theta3*backed(t-3)) / theta0.   */
/* The recursion is done in ONE pass with retained lags. The former version re-read the dataset     */
/* once per month for months 4 to 204, which was slow and silently left every month after the 204th */
/* smoothed. Results are identical for series of up to 204 months.                                  */

data portf_ret_series01_loop;
set portf_ret_series01;
backed_ret_aic = dem_portret;
keep backed_ret_aic
dem_portret portf yyyymm portf_seq avg_portret
aic_theta_0 aic_theta_1 aic_theta_2 aic_theta_3;
run;

proc sort data = portf_ret_series01_loop; by portf yyyymm; run;

data portf_ret_series01_loop;
set portf_ret_series01_loop;
/* unsmoothed returns of the three previous rows, carried across iterations */
retain lag1_backed_ret_aic lag2_backed_ret_aic lag3_backed_ret_aic;

/* portf_seq ge 4 guarantees that all three lags belong to the same portfolio */
if portf_seq ge 4 then backed_ret_aic = (dem_portret - aic_theta_1*lag1_backed_ret_aic - aic_theta_2*lag2_backed_ret_aic - aic_theta_3*lag3_backed_ret_aic)/aic_theta_0;
output;

/* shift the lag window only after the row has been written */
lag3_backed_ret_aic = lag2_backed_ret_aic;
lag2_backed_ret_aic = lag1_backed_ret_aic;
lag1_backed_ret_aic = backed_ret_aic;
run;

%mend back_out_portrets;
%back_out_portrets;

*clean dataset and add back mean;
*temp_backed_ret_aic = unsmoothed demeaned return plus the mean of the reported return;

data portf_ret_series01_loop;
set portf_ret_series01_loop;
temp_backed_ret_aic = backed_ret_aic + avg_portret;
keep portf yyyymm temp_backed_ret_aic;
run;

proc sort data = portf_ret_series01_loop; by portf yyyymm; run;
proc sort data = portf_ret_series01; by portf yyyymm; run;

*attach the unsmoothed series to the main portfolio file (match on portfolio and month);
data portf_ret_series02;
merge portf_ret_series01 portf_ret_series01_loop; 
by portf yyyymm; 
run;

*Adjust mean;
*check std of smoothed and unsmoothed returns;
*autoname creates portret_mean portret_stddev temp_backed_ret_aic_mean temp_backed_ret_aic_stddev;

proc sort data = portf_ret_series02; by portf; run;
proc summary data = portf_ret_series02; 
var portret temp_backed_ret_aic;
output out = v_portf_ret_series02_mean
mean = std = /autoname;
by portf; 
run;

*shift the unsmoothed series so that its sample mean equals the sample mean of the reported series;
data portf_ret_series02;
merge portf_ret_series02 v_portf_ret_series02_mean;
by portf; 
backed_ret_aic = temp_backed_ret_aic + portret_mean - temp_backed_ret_aic_mean;
drop _TYPE_ _FREQ_;
run;

*Compute differences in portfolio returns (q5 - q1);
*One macro call per sort builds w_<sort>_t_q1 and w_<sort>_t_q5 and the spread w_<sort>_t_q5q1;

%macro make_q5q1_spread(srt);
/* srt : sort prefix (r0 r1 r3 r0d r1d r3d r0cd r1cd r3cd).                                        */
/* Reads portf_ret_series02 (portf yyyymm portret backed_ret_aic portf_seq).                       */
/* The two legs are match-merged BY yyyymm. The former merge had no BY statement and therefore     */
/* paired rows by position, which silently misaligns the legs if one of them lacks a month.        */
/* With the match-merge an unmatched month gives a missing spread instead of a wrong one.          */
%local q;

%do q = 1 %to 5 %by 4; /* q takes the values 1 and 5 */
data w_&srt._t_q&q;
set portf_ret_series02;
if portf = "&srt._t_q&q";
&srt._t_q&q._rep = portret;
&srt._t_q&q._uns = backed_ret_aic;
keep yyyymm &srt._t_q&q._rep &srt._t_q&q._uns portf_seq;
run;

proc sort data = w_&srt._t_q&q; by yyyymm; run;
%end;

/* q5 minus q1, for reported (portret) and unsmoothed (backed_ret_aic) returns */
data w_&srt._t_q5q1;
merge w_&srt._t_q5 w_&srt._t_q1;
by yyyymm;
portret = &srt._t_q5_rep - &srt._t_q1_rep;
backed_ret_aic = &srt._t_q5_uns - &srt._t_q1_uns;
portf = "wdif_&srt._t_q5q1";
keep yyyymm portf portret backed_ret_aic portf_seq;
run;

%mend make_q5q1_spread;

*R0 R1 R3;
%make_q5q1_spread(r0);
%make_q5q1_spread(r1);
%make_q5q1_spread(r3);
*Dimson;
%make_q5q1_spread(r0d);
%make_q5q1_spread(r1d);
%make_q5q1_spread(r3d);
*C Dimson;
%make_q5q1_spread(r0cd);
%make_q5q1_spread(r1cd);
%make_q5q1_spread(r3cd);


*Set all differences together;
*(the cd files come first because their portf labels are the longest and the first file fixes the length of portf);
data w_diff_series;
set w_r0cd_t_q5q1 w_r1cd_t_q5q1 w_r3cd_t_q5q1
w_r0d_t_q5q1 w_r1d_t_q5q1 w_r3d_t_q5q1
w_r0_t_q5q1 w_r1_t_q5q1 w_r3_t_q5q1;
run;

*Attach to main dataset;
data portf_ret_series02;
set portf_ret_series02 w_diff_series;
run;


**Attach factors and prepare for portfolio-level regressions;
proc sort data = portf_ret_series02; by yyyymm; run;
proc sort data = hffact; by yyyymm; run;

*portret_rep = reported return and portret_uns = unsmoothed return, both used as regression left-hand sides;
data portf_ret_series02a;
merge portf_ret_series02 hffact; 
by yyyymm; 

*drop the first 3 months of every series: the unsmoothing recursion starts at portf_seq 4 so months 1 to 3 are not unsmoothed;
*(the former condition portf_seq ge 3 kept month 3 - factor rows without a portfolio have a missing portf_seq and are dropped too);
if portf_seq gt 3;

*NB: do not subtract rf rate if portfolio is already a difference;
if portf in ("wdif_r0_t_q5q1", "wdif_r1_t_q5q1", "wdif_r3_t_q5q1",
             "wdif_r0d_t_q5q1", "wdif_r1d_t_q5q1", "wdif_r3d_t_q5q1",
             "wdif_r0cd_t_q5q1", "wdif_r1cd_t_q5q1", "wdif_r3cd_t_q5q1") then do;
portret_rep = portret;
portret_uns = backed_ret_aic;
end;
else do;
portret_rep = portret - rf;
portret_uns = backed_ret_aic - rf;
end;
run;

**************** REGRESSIONS - MAIN 1;
*Unsmoothed portfolio returns, no Dimson lags;
*Per-portfolio OLS of portret_uns on fh1 to fh8 - the intercept is the alpha;

proc sort data = portf_ret_series02a; by portf; run;
proc reg data = portf_ret_series02a outest = v_portf_fhreg1 noprint tableout;
by portf;
model portret_uns = fh1-fh8 / edf adjrsq;
run; quit;

*keep the coefficient row (PARMS) and the t-statistic row (T);
data portf_fhreg1 (keep = portf _MODEL_ _TYPE_ _DEPVAR_ intercept _ADJRSQ_);
set v_portf_fhreg1;
where _TYPE_ in ("PARMS", "T");
run;

*Adjust - organize;
proc sort data = portf_fhreg1; by portf; run;

*alpha in percent per year (monthly intercept times 12 times 100);
data w_portf_fhreg1_alpha_uns (keep = portf uns_alpha);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "PARMS";
uns_alpha = intercept*12*100;
run;

*t-statistic of the intercept;
data w_portf_fhreg1_t_uns (keep = portf uns_t);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "T";
uns_t = intercept;
run;

data v_tab_portf_uns_&sampsel;
merge w_portf_fhreg1_alpha_uns w_portf_fhreg1_t_uns;
by portf;
run;

*clean for output;
data tab_portf_uns_&sampsel;
set v_tab_portf_uns_&sampsel (keep = portf uns_alpha uns_t);
run;


**************** REGRESSIONS - MAIN 2;
*** Regress reported portfolio returns (not unsmoothed);
*.. on contemporaneous and three lags of the factors to obtain Dimson alphas;
*(Note to self: this is not reported in the paper - was only shown in revision steps);

proc sort data = portf_ret_series02a; by portf; run;
proc reg data = portf_ret_series02a outest = v_portf_fhreg1 noprint tableout;
by portf;
model portret_rep = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 l3_fh1-l3_fh8 / edf adjrsq;
run; quit;

*keep the coefficient row (PARMS) and the t-statistic row (T);
data portf_fhreg1 (keep = portf _MODEL_ _TYPE_ _DEPVAR_ intercept _ADJRSQ_);
set v_portf_fhreg1;
where _TYPE_ in ("PARMS", "T");
run;

*Adjust - organize;
proc sort data = portf_fhreg1; by portf; run;

*alpha in percent per year (monthly intercept times 12 times 100);
data w_portf_fhreg1_alpha_dim (keep = portf dim_alpha);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "PARMS";
dim_alpha = intercept*12*100;
run;

*t-statistic of the intercept;
data w_portf_fhreg1_t_dim (keep = portf dim_t);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "T";
dim_t = intercept;
run;

data v_tab_portf_dim_&sampsel;
merge w_portf_fhreg1_alpha_dim w_portf_fhreg1_t_dim;
by portf;
run;

*clean for output;
data tab_portf_dim_&sampsel;
set v_tab_portf_dim_&sampsel (keep = portf dim_alpha dim_t);
run;



**************** REGRESSIONS - MAIN 3;
*** Regress 1-step unsmoothed portfolio returns;
*.. on contemporaneous and three lags of the factors to obtain Dimson alphas;
*Note: this is going to be in the internet appendix (IA8);

proc sort data = portf_ret_series02a; by portf; run;
proc reg data = portf_ret_series02a outest = v_portf_fhreg1 noprint tableout;
by portf;
model portret_uns = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 l3_fh1-l3_fh8 / edf adjrsq;
run; quit;

*keep the coefficient row (PARMS) and the t-statistic row (T);
data portf_fhreg1 (keep = portf _MODEL_ _TYPE_ _DEPVAR_ intercept _ADJRSQ_);
set v_portf_fhreg1;
where _TYPE_ in ("PARMS", "T");
run;

*Adjust - organize;
proc sort data = portf_fhreg1; by portf; run;

*alpha in percent per year (monthly intercept times 12 times 100);
data w_portf_fhreg1_alpha_dim (keep = portf dim_alpha);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "PARMS";
dim_alpha = intercept*12*100;
run;

*t-statistic of the intercept;
data w_portf_fhreg1_t_dim (keep = portf dim_t);
set portf_fhreg1;
where _MODEL_ = "MODEL1" and _TYPE_ = "T";
dim_t = intercept;
run;

data v_tab_portf_unsdim_&sampsel;
merge w_portf_fhreg1_alpha_dim w_portf_fhreg1_t_dim;
by portf;
run;

*clean for output;
data tab_portf_unsdim_&sampsel;
set v_tab_portf_unsdim_&sampsel (keep = portf dim_alpha dim_t);
run;


**************** REGRESSIONS - MAIN 4;
*** Regress 1-step unsmoothed portfolio returns;
*.. CDimson factors, with flexible CDimson (this step added in second revision);


*one row per portfolio, used as merge frame below;
data w_portf_list; set portf_ret_series02a; keep portf; run;
proc sort data = w_portf_list nodupkey; by portf; run;

%macro maincdim;
/* Constrained Dimson regression estimated by nonlinear least squares (PROC NLIN), per portfolio.   */
/* Model: portret_uns = b0 + sum over k of bk*(fhk + omeg1*l1_fhk + omeg2*l2_fhk + omeg3*l3_fhk),    */
/* with every omeg bounded to the interval 0 to 1.                                                  */
/* The fit is repeated for seven starting values of the omegs (0.01 0.16 0.31 0.46 0.61 0.76 0.91)  */
/* and the run with the smallest SSE is kept.                                                       */
/* Outputs: portf_fhreg4_coeff (selected coefficients) and v_portf_fhreg4_t (alpha and t-stat of    */
/* every run, one set of columns per starting value).                                               */

%do multin= 1 %to 91 %by 15;
%let multi = %sysevalf(0.01*&multin);

proc sort data = portf_ret_series02a; by portf; run;
proc nlin data= portf_ret_series02a outest = w_portf_fhreg4 SMETHOD=GOLDEN /*noprint*/;
ods output parameterestimates= w_fhreg4_est_&multin;
parameters b0=0 b1=1 b2=1 b3=1 b4=1 b5=1 b6=1 b7=1 b8=1 omeg1 = &multi omeg2 = &multi omeg3 = &multi; 
bounds 0<= omeg1 <= 1, 0<= omeg2 <= 1, 0<= omeg3 <= 1;
model portret_uns = b0 
+b1*fh1 +b2*fh2 +b3*fh3 +b4*fh4 +b5*fh5 +b6*fh6 +b7*fh7 +b8*fh8
+omeg1*b1*l1_fh1 +omeg1*b2*l1_fh2 +omeg1*b3*l1_fh3 +omeg1*b4*l1_fh4 +omeg1*b5*l1_fh5 +omeg1*b6*l1_fh6 +omeg1*b7*l1_fh7 +omeg1*b8*l1_fh8
+omeg2*b1*l2_fh1 +omeg2*b2*l2_fh2 +omeg2*b3*l2_fh3 +omeg2*b4*l2_fh4 +omeg2*b5*l2_fh5 +omeg2*b6*l2_fh6 +omeg2*b7*l2_fh7 +omeg2*b8*l2_fh8
+omeg3*b1*l3_fh1 +omeg3*b2*l3_fh2 +omeg3*b3*l3_fh3 +omeg3*b4*l3_fh4 +omeg3*b5*l3_fh5 +omeg3*b6*l3_fh6 +omeg3*b7*l3_fh7 +omeg3*b8*l3_fh8;
by portf;
run; quit;

/* keep the final iteration of this run and suffix every column with the starting value index */
data w_portf_fhreg4_&multin; *rename and clean;
set w_portf_fhreg4;
if _TYPE_ = "FINAL";
val_&multin = &multi;
STATUS_&multin = _STATUS_;
SSE_&multin = _SSE_;
alp_cd3_&multin = b0;
fh1_cd3_&multin = b1;
fh2_cd3_&multin = b2;
fh3_cd3_&multin = b3;
fh4_cd3_&multin = b4;
fh5_cd3_&multin = b5;
fh6_cd3_&multin = b6;
fh7_cd3_&multin = b7;
fh8_cd3_&multin = b8;
keep portf val_&multin STATUS_&multin SSE_&multin alp_cd3_&multin
fh1_cd3_&multin fh2_cd3_&multin fh3_cd3_&multin fh4_cd3_&multin
fh5_cd3_&multin fh6_cd3_&multin fh7_cd3_&multin fh8_cd3_&multin;
run;

*rename t-stat dataset;
/* intercept (b0) row of the ODS parameter estimates table: estimate and t value of this run */
data w_fhreg4_est_&multin;
set w_fhreg4_est_&multin;
if parameter = "b0";
alp_&multin = estimate;
t_alp_&multin = tValue;
keep portf alp_&multin t_alp_&multin;
run;

%end;

***Merge together;
**Merge the coefficients of all seven runs and find the solution with minimum SSE;
proc sort data = w_portf_list; by portf; run;
data portf_fhreg4_coeff;
merge w_portf_list %do multin=1 %to 91 %by 15; w_portf_fhreg4_&multin %end; ;
by portf; 
run;
*find the minimum SSE;
data portf_fhreg4_coeff;
set portf_fhreg4_coeff;
min_sse = min(sse_1 %do multin=16 %to 91 %by 15; ,SSE_&multin %end;);
min_sse_fhreg4 = min_sse;
run;
*Selected parameters;
/* start from run 1 and overwrite with any later run whose SSE equals the minimum (last match wins) */
data portf_fhreg4_coeff;
set portf_fhreg4_coeff;
fhreg4_min_sse_omeg_start = val_1;
fhreg4_best_status = STATUS_1;
fhreg4_best_alp = alp_cd3_1;
fhreg4_best_fh1 = fh1_cd3_1;
fhreg4_best_fh2 = fh2_cd3_1;
fhreg4_best_fh3 = fh3_cd3_1;
fhreg4_best_fh4 = fh4_cd3_1;
fhreg4_best_fh5 = fh5_cd3_1;
fhreg4_best_fh6 = fh6_cd3_1;
fhreg4_best_fh7 = fh7_cd3_1;
fhreg4_best_fh8 = fh8_cd3_1;
%do multin=16 %to 91 %by 15;
if min_sse = SSE_&multin then do;
fhreg4_min_sse_omeg_start = val_&multin;
fhreg4_best_status = STATUS_&multin;
fhreg4_best_alp = alp_cd3_&multin;
fhreg4_best_fh1 = fh1_cd3_&multin;
fhreg4_best_fh2 = fh2_cd3_&multin;
fhreg4_best_fh3 = fh3_cd3_&multin;
fhreg4_best_fh4 = fh4_cd3_&multin;
fhreg4_best_fh5 = fh5_cd3_&multin;
fhreg4_best_fh6 = fh6_cd3_&multin;
fhreg4_best_fh7 = fh7_cd3_&multin;
fhreg4_best_fh8 = fh8_cd3_&multin;
end;
%end;
keep portf fhreg4_best_status fhreg4_min_sse_omeg_start fhreg4_best_alp
fhreg4_best_fh1 fhreg4_best_fh2 fhreg4_best_fh3 fhreg4_best_fh4
fhreg4_best_fh5 fhreg4_best_fh6 fhreg4_best_fh7 fhreg4_best_fh8
min_sse_fhreg4;
run;

***Merge dataset with t-stats;
**Put the alpha and t-stat columns of all seven runs side by side (selection happens after the macro);
proc sort data = w_portf_list; by portf; run;
data v_portf_fhreg4_t;
merge w_portf_list %do multin=1 %to 91 %by 15; w_fhreg4_est_&multin %end; ;
by portf; 
run;


%mend maincdim;
%maincdim;

*Adjust - organize;
proc sort data = portf_fhreg4_coeff; by portf; run;

*annualised alpha in percent of the minimum-SSE run, plus the starting value that produced it;
data portf_fhreg4_alpha (keep = portf fhreg4_alpha fhreg4_min_sse_omeg_start);
set portf_fhreg4_coeff;
fhreg4_alpha = fhreg4_best_alp*12*100;
run;

*attach t-stats and select them;
proc sort data = portf_fhreg4_alpha; by portf; run;
proc sort data = v_portf_fhreg4_t; by portf; run;

*pick the t-stat of the run whose starting value was selected - any other value falls back to run 1;
data portf_fhreg4_alpha_t;
merge portf_fhreg4_alpha v_portf_fhreg4_t;
by portf;
select (fhreg4_min_sse_omeg_start);
when (0.16) fhreg4_t = t_alp_16;
when (0.31) fhreg4_t = t_alp_31;
when (0.46) fhreg4_t = t_alp_46;
when (0.61) fhreg4_t = t_alp_61;
when (0.76) fhreg4_t = t_alp_76;
when (0.91) fhreg4_t = t_alp_91;
otherwise fhreg4_t = t_alp_1;
end;
run;

*clean for output;
data tab_portf_unscdim_&sampsel;
set portf_fhreg4_alpha_t (keep = portf fhreg4_alpha fhreg4_t);
run;




*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 5: SIMULATION: MA(1) WITH 1-FACTOR MODEL;
****Produces: Table 5, Panel A;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;

************** MAIN SIMULATION --- TABLE 5; 
***** THE FOLLOWING CODE IS FOR TABLE 5, PANEL A;


*Panel A;
*1 Factor;
*Baseline parameters;
*Alpha: N(0,2%) annualized (the code draws monthly alphas with std 0.02/12);
*Beta: N(1,0.25);
*Factor: N(0.6%,4%);
*Epsilon = N(0,4%);

*Model fund details based on Relative Value Funds;
*N = 670;
*T = 85;
*3 versions;
*Version 1: phi1 = 0.3, pai1 = 0.3;
*Version 2: phi1 = 0.2, pai1 = 0.4;
*Version 3: phi1 = 0.4, pai1 = 0.2;

*Import fh factors;

data hffact;
set hfautoc.hf_factors;
run;

*fa1 = sp500_rf with its first lag, a constant join key fdumm and a running row counter fact_seq;
data factor1 (keep = yyyymm fa1 l1_fa1 fdumm fact_seq);
set hffact;
fa1 = sp500_rf;
l1_fa1 = lag1(sp500_rf);
fdumm = 1;
fact_seq + 1;
run;

proc sort data = factor1; by yyyymm; run;

*sample mean and standard deviation of the factor (fa1_Mean fa1_StdDev);
proc means data = factor1 noprint;
var fa1;
output out = factor_stats mean = std = / autoname;
run;

*first-order autocorrelation of the factor;
proc corr data = factor1 outp = v_factor1_corr noprint;
var fa1 l1_fa1;
run;

************************** START LOOP;

***How many Loops;
%let n_loops = 1000;


%macro bigloop;

%do version = 1 %to 3;

%if &version = 1 %then %do;
%let n_periods = 85;
%let n_funds = 670;
%let phi0 = 0.70; *3-step weight on relative, phi1 = 0.3;
%let pai0 = 0.70; *3-step weight on aggregate, pai1 = 0.3;
%let f1_mean = 0.006;
%let f1_std = 0.04;
%let alpha1_mean = 0;
%let alpha1_std = 0.02/12;
%let beta1_mean = 1;
%let beta1_std = 0.25;
%let err_std = 0.04;
%end;

%else %if &version = 2 %then %do;
%let n_periods = 85;
%let n_funds = 670;
%let phi0 = 0.80; *3-step weight on relative, phi1 = 0.2;
%let pai0 = 0.60; *3-step weight on aggregate, pai1 = 0.4;
%let f1_mean = 0.006;
%let f1_std = 0.04;
%let alpha1_mean = 0;
%let alpha1_std = 0.02/12;
%let beta1_mean = 1;
%let beta1_std = 0.25;
%let err_std = 0.04;
%end;

%else %if &version = 3 %then %do;
%let n_periods = 85;
%let n_funds = 670;
%let phi0 = 0.60; *3-step weight on relative, phi1 = 0.4;
%let pai0 = 0.80; *3-step weight on aggregate, pai1 = 0.2;
%let f1_mean = 0.006;
%let f1_std = 0.04;
%let alpha1_mean = 0;
%let alpha1_std = 0.02/12;
%let beta1_mean = 1;
%let beta1_std = 0.25;
%let err_std = 0.04;
%end;


%do loopn = 1 %to &n_loops;


****** Simulate Funds;

/* One row per simulated fund: alpha and beta drawn from the version parameters.            */
/* The stream is seeded with the loop number and the draw order is alp then beta1, fund by  */
/* fund, so the values are the same as when the draws are made in a separate data step.     */
data fundid_list;
call streaminit(&loopn);
do fundid = 1 to %eval(&n_funds);
fdumm = 1;
alp=RAND('NORMAL',&alpha1_mean,&alpha1_std);
beta1=RAND('NORMAL',&beta1_mean,&beta1_std);
output;
end;
run;

*factors;

/* number of candidate factor paths drawn per simulation */
%let factors_n = 100;

/* each candidate path has one normal draw per row of factor1 and its own seed (loopn*1000 + i) */
%do i = 1 %to &factors_n;

data wfact_draw&i;
set factor1;
call streaminit(%eval(&loopn*1000 + &i));
f1 = RAND('NORMAL',&f1_mean,&f1_std);
lag1_f1 = lag1(f1);
lag2_f1 = lag2(f1);
fact_i = &i;
keep yyyymm fact_seq f1 lag1_f1 lag2_f1 fdumm fact_i;
run;

%end;

/* stack the candidate paths (already ordered by fact_i) */
data wfact_draw_all;
set %do i = 1 %to &factors_n; wfact_draw&i %end; ;
run;

/* sample window used for the autocorrelation check: rows 13 to n_periods+12 */
data wfact_draw_all_gt12; set wfact_draw_all; if fact_seq gt 12; if fact_seq le %eval(&n_periods+12); run;

/* correlation of f1 with its first and second lag, per candidate path */
proc corr data = wfact_draw_all_gt12 out = wfact_draw_corr(where=(_TYPE_="CORR")) noprint;
var f1;
with lag1_f1 lag2_f1;
by fact_i;
run;

/* first and second order autocorrelation of every candidate path, one row per fact_i */
data wfact_draw_corr1; set wfact_draw_corr; if _NAME_ = 'lag1_f1'; fcorr1 = f1; keep fact_i fcorr1; run;
data wfact_draw_corr2; set wfact_draw_corr; if _NAME_ = 'lag2_f1'; fcorr2 = f1; keep fact_i fcorr2; run;

/* Match-merge on fact_i (both inputs inherit the fact_i order of the PROC CORR output).       */
/* The former merge had no BY statement and paired rows by position, which would mix up paths */
/* if one of the two files ever missed a row.                                                  */
data wfact_draw_corr12; 
merge wfact_draw_corr1 wfact_draw_corr2; 
by fact_i;
fcorr_abssum = abs(fcorr1) + abs(fcorr2);
run;

/* keep the three candidate paths with the smallest sum of absolute lag-1 and lag-2 autocorrelation */
/* and give each of them a uniform random number                                                     */
proc sort data =  wfact_draw_corr12; by fcorr_abssum; run;
data wfact_draw_corr12_keep;
set wfact_draw_corr12; 
call streaminit(&loopn);
if _n_ le 3;
u1 = rand("Uniform");
run;

/* the path with the smallest u1 is the one used in this simulation */
proc sort data =  wfact_draw_corr12_keep; by u1; run;
data wfact_draw_corr12_keep; *pick one set of factors at random;
set wfact_draw_corr12_keep; 
if _n_ = 1;
keep_factor_i = 1;
run;

proc sort data = wfact_draw_corr12_keep; by fact_i; run;
proc sort data = wfact_draw_all; by fact_i; run;

/* rows of the selected path only */
data wfact_draw_all_select; 
merge wfact_draw_all(in=a) wfact_draw_corr12_keep;
by fact_i; 
if a;
if keep_factor_i = 1;
run;


/* selected path limited to the first n_periods+12 rows, helper columns removed */
data fact_draw; *clean;
set wfact_draw_all_select; 
if fact_seq le %eval(&n_periods+12);
drop fact_i keep_factor_i fcorr1 fcorr2 fcorr_abssum u1;
run;

*keep;
/* full copy of the selected path, including the selection columns */
data drawn_factor;
set wfact_draw_all_select; 
run;


*attach factor to make frame;
/* every fund is paired with every month of the drawn factor (fdumm equals 1 on both sides) */
proc sql; create table fund_frame1 as select * 
from fundid_list, fact_draw
where fundid_list.fdumm = fact_draw.fdumm;
quit;

proc sort data = fund_frame1; by fundid yyyymm; run;

*draw error and compute real return;
/* The error stream gets its own seed (loopn*1000 + 500). The former seed was the loop number, */
/* which is also the seed of the alpha and beta draws, so the error stream replayed the        */
/* random stream behind the fund alphas and betas instead of being independent of them.       */
/* The new seed cannot coincide with the factor seeds (loopn*1000 + 1 to 100) nor, for up     */
/* to 1499 loops, with the fund seeds. NOTE: this changes the simulated numbers.              */
data fund_frame1;
set fund_frame1;
call streaminit(%eval(&loopn*1000 + 500));
err_draw = RAND('NORMAL',0,&err_std);
realr = alp + beta1*f1 + err_draw;
/* lag across fund boundaries only hits the first row of a fund, which lies in the 12 burn-in rows dropped later */
lag1_realr = lag1(realr);
run;

*Compute real aggregate return;
*Then compute 3-step observed return;

/* equal-weighted cross-sectional mean of the true return and of its lag, per month */
proc sort data = fund_frame1; by yyyymm; run;
proc summary data = fund_frame1; 
var realr lag1_realr;
output out = v_real_aggr_ret mean = agg_realr lag1_agg_realr;
by yyyymm; 
run;

/* relative component = fund return minus the aggregate return (current and lagged) */
data fund_frame1;
merge fund_frame1 v_real_aggr_ret;
by yyyymm; 
rela_realr = realr - agg_realr;
lag1_rela_realr = lag1_realr - lag1_agg_realr;
drop _TYPE_ _FREQ_;
run;

*Compute 3-step observed return;
/* the relative part is smoothed with weights phi0 and 1-phi0, the aggregate part with pai0 and 1-pai0 */
proc sort data = fund_frame1; by fundid yyyymm; run;
data fund_frame1;
set fund_frame1;
obsr = &phi0*rela_realr +(1-&phi0)*lag1_rela_realr + &pai0*agg_realr +(1-&pai0)*lag1_agg_realr; *3-step observed return;
run;

*set first to missing;
/* lagged observed return, blanked on the first row of every fund */
data fund_frame1;
set fund_frame1;
lag1_obsr = lag1(obsr);
if fundid ne lag(fundid) then lag1_obsr = . ;
run;

*aggregate observed return;
/* equal-weighted cross-sectional mean of the observed return, per month */
proc sort data = fund_frame1; by yyyymm; run;
proc summary data = fund_frame1; 
var obsr;
output out = v_aggr_obs mean = aggr_obsr;
by yyyymm; run;

data fund_frame1; 
merge fund_frame1 v_aggr_obs; 
by yyyymm; 
drop _TYPE_ _FREQ_;
run;

/**************************************************************************************/

*****Proceed to unsmooth the simulated returns;

/******************************************************************************************************************/
**** Part 1: standard MA method (as in GLM(2004));


/* estimation sample: drop the 12 burn-in rows of every fund */
data fund_frame2;
set fund_frame1;
if fact_seq gt 12; *the first 12 obs were just for drawing the parameters and then smoothing the returns;
run;
/* fund_seq numbers the remaining months of each fund, starting at 1 */
proc sort data = fund_frame2; by fundid yyyymm; run;
data fund_frame2;
set fund_frame2;
by fundid;
if first.fundid then fund_seq = 1;
else fund_seq + 1;
run;

/* time-series mean of the observed return per fund */
proc sort data = fund_frame2; by fundid yyyymm; run;
proc summary data = fund_frame2; 
var obsr;
output out = v_avgfundret mean = avfund_obsr;
by fundid; 
run;

/* n_fund_obs = number of rows of the fund, dem_obsr = observed return minus its fund mean */
data fund_frame2;
merge fund_frame2 v_avgfundret;
by fundid;
n_fund_obs = _FREQ_;
dem_obsr = obsr - avfund_obsr;
drop _TYPE_ _FREQ_;
run;

*MA with 1 lags;
/* MA(1) without intercept, fitted by ML to the demeaned observed return of every fund */

proc sort data = fund_frame2; by fundid yyyymm; run;
proc arima data= fund_frame2;
identify var = dem_obsr noprint;
estimate q= 1 noint ma = -0.2 method=ml OUTEST= arima_ma1_est OUTSTAT=arima_ma1_diag noprint;
forecast noprint;
by fundid;
run;
quit;


*estimated MA parameters;
/* weights (1, -ma1_1) divided by their sum, so that theta_0 + theta_1 = 1 */
data arima_ma1_est;
set arima_ma1_est;
if _TYPE_ = 'EST';
s1_ma1_STATUS = _STATUS_;
ma1_theta_sum = 1 - ma1_1;
ma1_theta_0 = 1/ma1_theta_sum;
ma1_theta_1 = -ma1_1/ma1_theta_sum;
ma1_theta_sum_norm = ma1_theta_0 + ma1_theta_1;
keep fundid s1_ma1_STATUS ma1_theta_0 ma1_theta_1 ma1_theta_sum;
run;


*attach thetas to main dataset;
data fund_frame2;
merge fund_frame2 arima_ma1_est;
by fundid;
run;


*Do not unsmooth if MA values do not converge;
/* The flag is set when a theta lies outside the range -0.45 to 1.25 or the status is not        */
/* "0 Converged" (the variable name mentions 1_5 but the thresholds used are 1.25 and -0.45).    */
/* Flagged funds get theta_0 = 1 and theta_1 = 0, i.e. their observed return is left unchanged.  */
data fund_frame2;
set fund_frame2;
fix_theta_abs_gt_1_5 = 0;
if ma1_theta_0 > 1.25 or ma1_theta_0 < -0.45 then fix_theta_abs_gt_1_5 = 1;
if ma1_theta_1 > 1.25 or ma1_theta_1 < -0.45 then fix_theta_abs_gt_1_5 = 1;
if s1_ma1_STATUS ne "0 Converged" then fix_theta_abs_gt_1_5 = 1;
if fix_theta_abs_gt_1_5 = 1 then ma1_theta_0 = 1;
if fix_theta_abs_gt_1_5 = 1 then ma1_theta_1 = 0;
run;

*save;
/* one row per fund with the thetas actually used */
data v_save_thetas;
set fund_frame2;
keep fundid ma1_theta_0 ma1_theta_1;
run;

proc sort data = v_save_thetas nodupkey; by fundid; run;

/********************************************************/
*back out Unsmoothed returns using 1-step method;

proc sort data = fund_frame2; by fundid yyyymm; run;

/* start value: the demeaned observed return (this is also the final value of the first month) */
data fund_frame2_loop;
set fund_frame2;
backed_ret_fix = dem_obsr;
keep backed_ret_fix obsr dem_obsr fundid yyyymm fund_seq avfund_obsr
ma1_theta_0 ma1_theta_1;
run;

proc sort data = fund_frame2_loop; by fundid yyyymm; run;

/* MA(1) recursion backed(t) = (dem(t) - theta1*backed(t-1)) / theta0 for months 2 to n_periods. */
/* It is computed in ONE pass with a retained lag. The former macro loop re-read and re-wrote    */
/* the whole dataset once per month in every simulation; the arithmetic and the results are     */
/* unchanged.                                                                                    */
data fund_frame2_loop;
set fund_frame2_loop;
retain lag1_backed_ret_fix;
/* fund_seq ge 2 guarantees that the retained lag belongs to the same fund */
if 2 le fund_seq le &n_periods then backed_ret_fix = (dem_obsr - ma1_theta_1*lag1_backed_ret_fix)/ma1_theta_0;
output;
/* update the lag only after the row has been written */
lag1_backed_ret_fix = backed_ret_fix;
run;


*clean dataset and add back mean fund return;
data fund_frame2_loop;
set fund_frame2_loop;
temp_backed_s1ret = backed_ret_fix + avfund_obsr;
keep fundid yyyymm temp_backed_s1ret obsr;
run;


/* per-fund means of the unsmoothed and of the observed return (temp_backed_s1ret_mean obsr_mean) */
proc sort data = fund_frame2_loop; by fundid; run;
proc summary data = fund_frame2_loop; 
var temp_backed_s1ret obsr;
output out = w_s1_adj_avg 
mean = /autoname;
by fundid; 
run;

/* shift the unsmoothed series so that its fund mean equals the fund mean of the observed return */
data fund_frame2_loop;
merge fund_frame2_loop w_s1_adj_avg;
backed_s1ret = temp_backed_s1ret - temp_backed_s1ret_mean + obsr_mean;
by fundid; 
keep fundid yyyymm backed_s1ret;
run;

*attach to dataset;
proc sort data = fund_frame2; by fundid yyyymm; run;
proc sort data = fund_frame2_loop; by fundid yyyymm; run;

data fund_frame2; 
merge fund_frame2 fund_frame2_loop;
by fundid yyyymm; 
run;


/********************************************************/
/* 3-step unsmoothing, step 1: start with the aggregate return series. */

/* One row per month with the aggregate observed return. */
data s1ag00(keep = yyyymm aggr_obsr);
    set fund_frame2;
run;

proc sort data = s1ag00 nodupkey;
    by yyyymm;
run;

/* Running month counter plus a constant key used for the one-group merges. */
data s1ag00;
    set s1ag00;
    agg_aux_seq + 1;
    aggdumm = 1;
run;

/* Demean the aggregate series. */
proc summary data = s1ag00;
    by aggdumm;
    var aggr_obsr;
    output out = w_mean_ew_ret mean = av_aggr_obsr;
run;

data s1ag00;
    merge s1ag00 w_mean_ew_ret;
    by aggdumm;
    dem_aggr_obsr = aggr_obsr - av_aggr_obsr;
    drop _TYPE_ _FREQ_;
run;

/* Fit an MA(1) without intercept to the demeaned aggregate return. */
proc sort data = s1ag00;
    by yyyymm;
run;

proc arima data = s1ag00;
    identify var = dem_aggr_obsr noprint;
    estimate q = 1 noint ma = -0.2 method = ml
             outest = ag_arima_ma1_est outstat = ag_arima_ma1_diag noprint;
    forecast noprint;
run;
quit;

/* Convert the MA coefficient into aggregate smoothing weights (pais), which
   add up to one by construction. */
data ag_arima_ma1_est;
    set ag_arima_ma1_est(where = (_TYPE_ = 'EST'));
    s3_aggr_ma1_STATUS = _STATUS_;
    ma1_pai_sum        = 1 - ma1_1;
    ma1_pai_0          = 1/ma1_pai_sum;
    ma1_pai_1          = -ma1_1/ma1_pai_sum;
    /* diagnostic only: this variable is not in the KEEP list */
    ma1_pai_sum_norm   = ma1_pai_0 + ma1_pai_1;
    aggdumm            = 1;
    keep aggdumm s3_aggr_ma1_STATUS ma1_pai_0 ma1_pai_1 ma1_pai_sum;
run;

/* Save the pais with a constant key for the later merge by dumm. */
data v_save_pais(keep = ma1_pai_0 ma1_pai_1 dumm);
    set ag_arima_ma1_est;
    dumm = 1;
run;

/* Attach the aggregate smoothing weights to every month. */
data s1ag01;
    merge s1ag00 ag_arima_ma1_est;
    by aggdumm;
    drop _TYPE_ _FREQ_;
run;

/* Recover the unsmoothed aggregate return by inverting the MA(1). */
proc sort data = s1ag01;
    by yyyymm;
run;

/* Seed the recursion with the demeaned observed aggregate return. */
data s1ag01_loop;
    set s1ag01;
    backed_aggrret_fix = dem_aggr_obsr;
    keep yyyymm agg_aux_seq backed_aggrret_fix dem_aggr_obsr av_aggr_obsr aggdumm
         ma1_pai_0 ma1_pai_1;
run;

/* One pass per month index: pass i rewrites only the row with agg_aux_seq = i,
   using the previous row as stored by the preceding pass. */
%do i = 2 %to &n_periods;

    data s1ag01_loop;
        set s1ag01_loop;
        lag1_backed_aggrret_fix = lag1(backed_aggrret_fix);
        if agg_aux_seq = &i then
            backed_aggrret_fix = (dem_aggr_obsr - ma1_pai_1*lag1_backed_aggrret_fix)/ma1_pai_0;
    run;

%end;


/* Mean of the recovered aggregate series. */
proc summary data = s1ag01_loop;
    by aggdumm;
    var backed_aggrret_fix;
    output out = check_av_s1ag01_loop mean = / autoname;
run;

/* Re-centre the recovered series at zero, keep that demeaned version, then
   add back the mean of the observed aggregate return. */
data s1ag01_loop;
    merge s1ag01_loop check_av_s1ag01_loop;
    by aggdumm;
    backed_aggrret_fix     = backed_aggrret_fix - backed_aggrret_fix_mean;
    dem_backed_aggrret_fix = backed_aggrret_fix;
    backed_aggrret_fix     = backed_aggrret_fix + av_aggr_obsr;
    keep yyyymm backed_aggrret_fix dem_backed_aggrret_fix;
run;

proc sort data = s1ag01_loop;
    by yyyymm;
run;

proc sort data = s1ag01;
    by yyyymm;
run;

/* Aggregate dataset with observed and recovered returns side by side. */
data s1ag02;
    merge s1ag01 s1ag01_loop;
    by yyyymm;
run;


****************************************************************;
/* Step 2: fund-level returns in excess of the aggregate return. */

/* Fund-level data. */
data s2fund00(keep = yyyymm obsr fundid fund_seq);
    set fund_frame2;
run;

/* Aggregate observed and recovered returns, plus one lag of the recovered one. */
data w_au_ewcatert(keep = yyyymm aggr_obsr backed_aggrret_fix);
    set s1ag02;
run;

proc sort data = w_au_ewcatert;
    by yyyymm;
run;

data w_au_ewcatert;
    set w_au_ewcatert;
    lag1_backed_aggrret_fix = lag1(backed_aggrret_fix);
run;

/* Attach the aggregate series to each fund-month and form the excess return. */
proc sort data = s2fund00;
    by yyyymm;
run;

proc sort data = w_au_ewcatert;
    by yyyymm;
run;

data s2fund00;
    merge s2fund00 w_au_ewcatert;
    by yyyymm;
    ret_excat = obsr - aggr_obsr;
run;

/* Per-fund means of the excess return and of both aggregate regressors. */
proc sort data = s2fund00;
    by fundid;
run;

proc summary data = s2fund00;
    by fundid;
    var ret_excat backed_aggrret_fix lag1_backed_aggrret_fix;
    output out = w_mean_excat_ret
           mean = av_retexcat_fundid av_backed_aggrret_fix_fund av_lag1_backed_aggrret_fix_fund;
run;

/* Demean all three series fund by fund. */
data s2fund00;
    merge s2fund00 w_mean_excat_ret;
    by fundid;
    dem_ret_excat               = ret_excat - av_retexcat_fundid;
    dem_backed_aggrret_fix      = backed_aggrret_fix - av_backed_aggrret_fix_fund;
    dem_lag1_backed_aggrret_fix = lag1_backed_aggrret_fix - av_lag1_backed_aggrret_fix_fund;
    drop _TYPE_ _FREQ_;
run;



/* Fund-by-fund MA(1) on the demeaned excess return, with the contemporaneous
   and lagged recovered aggregate returns as inputs. */
proc sort data = s2fund00;
    by fundid yyyymm;
run;

proc arima data = s2fund00;
    identify var = dem_ret_excat
             crosscorr = (dem_backed_aggrret_fix dem_lag1_backed_aggrret_fix) noprint;
    estimate q = 1
             input = (dem_backed_aggrret_fix dem_lag1_backed_aggrret_fix)
             noint ma = -0.2 method = ml
             outest = excat_arima_ma1fix_est_v2
             outstat = excat_arima_ma1fix_diag_v2 noprint;
    by fundid;
run;
quit;

/* Smoothing weights (phis) and the two input coefficients per fund. */
data excat_arima_ma1fix_est_v2;
    set excat_arima_ma1fix_est_v2(where = (_TYPE_ = 'EST'));
    s2_rel_ma1_STATUS = _STATUS_;
    ma1_phi_sum       = 1 - ma1_1;
    ma1_phi_0         = 1/ma1_phi_sum;
    ma1_phi_1         = -ma1_1/ma1_phi_sum;
    ma1_ew_l0         = i1_1;
    ma1_ew_l1         = i2_1;
    keep fundid s2_rel_ma1_STATUS ma1_phi_0 ma1_phi_1 ma1_phi_sum ma1_ew_l0 ma1_ew_l1;
run;

proc sort data = s2fund00;
    by fundid;
run;

proc sort data = excat_arima_ma1fix_est_v2;
    by fundid;
run;

data s2fund01;
    merge s2fund00 excat_arima_ma1fix_est_v2;
    by fundid;
run;

/* Same guard as for the thetas: fall back to phi_0 = 1, phi_1 = 0 when a
   weight is above 1.25 or below -0.45 (a missing weight also compares below
   -0.45) or when the status is not "0 Converged". */
data s2fund01;
    set s2fund01;
    fix_phi_abs_gt_1_5 = (   ma1_phi_0 > 1.25 or ma1_phi_0 < -0.45
                          or ma1_phi_1 > 1.25 or ma1_phi_1 < -0.45
                          or s2_rel_ma1_STATUS ne "0 Converged");
    if fix_phi_abs_gt_1_5 = 1 then do;
        ma1_phi_0 = 1;
        ma1_phi_1 = 0;
    end;
run;

/* One row per fund with the phis actually used. */
data v_save_phis(keep = fundid ma1_phi_0 ma1_phi_1 ma1_ew_l0 ma1_ew_l1);
    set s2fund01;
run;

proc sort data = v_save_phis nodupkey;
    by fundid;
run;

/* Recover the unsmoothed excess return by inverting the fund-level MA(1). */
proc sort data = s2fund01;
    by fundid yyyymm;
run;

/* Seed the recursion with the demeaned excess return. */
data s2fund01_loop;
    set s2fund01;
    backed_ret_excat_fix = dem_ret_excat;
    keep backed_ret_excat_fix dem_ret_excat fundid yyyymm fund_seq
         ma1_phi_0 ma1_phi_1 av_retexcat_fundid
         dem_backed_aggrret_fix dem_lag1_backed_aggrret_fix ma1_ew_l0 ma1_ew_l1;
run;

proc sort data = s2fund01_loop;
    by fundid yyyymm;
run;

/* One pass per period index: pass i rewrites only rows with fund_seq = i,
   using the previous row as stored by the preceding pass. */
%do i = 2 %to &n_periods;

    data s2fund01_loop;
        set s2fund01_loop;
        lag1_backed_ret_excat_fix = lag1(backed_ret_excat_fix);
        if fund_seq = &i then
            backed_ret_excat_fix = (dem_ret_excat -ma1_phi_1*lag1_backed_ret_excat_fix)/ma1_phi_0;
    run;

%end;


/* Keep only the recovered excess-return residual. */
data s2fund01_loop;
    set s2fund01_loop;
    res_backed_ret_excat_fix = backed_ret_excat_fix;
    keep fundid yyyymm res_backed_ret_excat_fix;
run;

proc sort data = s2fund01_loop;
    by fundid yyyymm;
run;

proc sort data = s2fund01;
    by fundid yyyymm;
run;

data s2fund02;
    merge s2fund01 s2fund01_loop;
    by fundid yyyymm;
run;


/* Step 3: add up the recovered components from steps 1 and 2. */

/* Aggregate component. */
data s1_resid(keep = yyyymm av_aggr_obsr backed_aggrret_fix dem_backed_aggrret_fix);
    set s1ag02;
run;

/* Fund-level excess-return component. */
data s2_resid(keep = fundid yyyymm dem_ret_excat res_backed_ret_excat_fix);
    set s2fund02;
run;

/* Attach the aggregate component by month. */
proc sort data = fund_frame2;
    by yyyymm;
run;

proc sort data = s1_resid;
    by yyyymm;
run;

data s3_00;
    merge fund_frame2 s1_resid;
    by yyyymm;
run;

/* Attach the fund component by fund-month and form the provisional 3-step
   return: aggregate part + fund part + fund mean. */
proc sort data = s3_00;
    by fundid yyyymm;
run;

proc sort data = s2_resid;
    by fundid yyyymm;
run;

data s3_00;
    merge s3_00 s2_resid;
    by fundid yyyymm;
    s3_uns_ret_fix_temp = dem_backed_aggrret_fix + res_backed_ret_excat_fix + avfund_obsr;
run;

/* Shift the provisional series so that its fund mean equals the observed mean. */
proc sort data = s3_00;
    by fundid;
run;

proc summary data = s3_00;
    by fundid;
    var s3_uns_ret_fix_temp obsr;
    output out = w_adj_s3_uns mean = / autoname;
run;

data s3_00;
    merge s3_00 w_adj_s3_uns;
    by fundid;
    s3_uns_ret_fix = s3_uns_ret_fix_temp - s3_uns_ret_fix_temp_mean + obsr_mean;
    drop s3_uns_ret_fix_temp_mean obsr_mean _TYPE_ _FREQ_;
run;

/* Outlier check: per-fund volatility of each unsmoothed series relative to
   the observed series, then the smallest and largest ratio across funds. */
proc sort data = s3_00;
    by fundid;
run;

proc summary data = s3_00;
    by fundid;
    var obsr backed_s1ret s3_uns_ret_fix;
    output out = v_check_std_unsm std = r0_std r1_std r3_std;
run;

data v_check_std_unsm;
    set v_check_std_unsm;
    r1_r0_std_ratio = r1_std/r0_std;
    r3_r0_std_ratio = r3_std/r0_std;
run;

proc summary data = v_check_std_unsm;
    var r1_r0_std_ratio r3_r0_std_ratio;
    output out = check_std_unsm(drop = _TYPE_ _FREQ_)
           min = max = / autoname;
run;


/***************************************************************************/
/* Analyze and summarize: average smoothing parameters across funds. */

/* Thetas and phis side by side, with a constant key for the next merge. */
data v_save_smooth_coeff;
    merge v_save_thetas v_save_phis;
    by fundid;
    dumm = 1;
run;

/* Spread the single row of aggregate pais over all funds. */
data v_save_smooth_coeff;
    merge v_save_smooth_coeff v_save_pais;
    by dumm;
run;

/* The two input coefficients are stored under the names ma1_psi_l0 and
   ma1_psi_l1 in the output. */
proc summary data = v_save_smooth_coeff;
    var ma1_theta_0 ma1_theta_1 ma1_pai_0 ma1_pai_1
        ma1_phi_0 ma1_phi_1 ma1_ew_l0 ma1_ew_l1;
    output out = save_smooth_coeff(drop = _TYPE_ _FREQ_)
           mean = ma1_theta_0 ma1_theta_1 ma1_pai_0 ma1_pai_1
                  ma1_phi_0 ma1_phi_1 ma1_psi_l0 ma1_psi_l1;
run;

/* Analysis dataset with short names: rr = realr, r0 = observed,
   r1 = 1-step unsmoothed, r3 = 3-step unsmoothed. */
/* NOTE(review): lag2_f2 in the KEEP list looks like a typo for lag2_f1.
   Left unchanged because the columns of s3_00 are not visible here - confirm. */
data an00;
    set s3_00;
    rr = realr;
    r0 = obsr;
    r1 = backed_s1ret;
    r3 = s3_uns_ret_fix;
    keep fundid yyyymm fund_seq f1 lag1_f1 lag2_f2
         rr r0 r1 r3;
run;

/* First and second lags of each return series. The LAG calls run on every
   row. Values that would reach back before the first row of a fund are then
   blanked out using fund_seq. */
proc sort data = an00;
    by fundid yyyymm;
run;

data an00;
    set an00;
    lag1_rr = lag1(rr);
    lag1_r0 = lag1(r0);
    lag1_r1 = lag1(r1);
    lag1_r3 = lag1(r3);
    lag2_rr = lag2(rr);
    lag2_r0 = lag2(r0);
    lag2_r1 = lag2(r1);
    lag2_r3 = lag2(r3);
    if fund_seq = 1 then call missing(lag1_rr, lag1_r0, lag1_r1, lag1_r3);
    if fund_seq le 2 then call missing(lag2_rr, lag2_r0, lag2_r1, lag2_r3);
run;

/* Mean and standard deviation of each return series, fund by fund. */
proc sort data = an00;
    by fundid;
run;

proc summary data = an00;
    by fundid;
    var rr r0 r1 r3;
    output out = s_ret_funds mean = std = / autoname;
run;

/* Absolute gap between the standard deviation of rr and of each other series. */
data s_ret_funds;
    set s_ret_funds;
    abserr_r0_std = abs(rr_stddev - r0_stddev);
    abserr_r1_std = abs(rr_stddev - r1_stddev);
    abserr_r3_std = abs(rr_stddev - r3_stddev);
run;

/* Average the per-fund statistics across funds. */
proc summary data = s_ret_funds;
    var rr_mean rr_stddev r0_mean r0_stddev
        r1_mean r1_stddev r3_mean r3_stddev
        abserr_r0_std abserr_r1_std abserr_r3_std;
    output out = avs_ret_funds
           mean = av_rr_mean av_rr_std av_r0_mean av_r0_std
                  av_r1_mean av_r1_std av_r3_mean av_r3_std
                  av_abserr_r0_std av_abserr_r1_std av_abserr_r3_std;
run;

/* Autocorrelations, measured as the slope of each series on its own first
   or second lag. */

/* Fund level: one set of regressions per fund. */
proc sort data = an00;
    by fundid yyyymm;
run;

proc reg data = an00 tableout outest = v_autoc_fund noprint;
    model rr = lag1_rr / edf;
    model rr = lag2_rr / edf;
    model r0 = lag1_r0 / edf;
    model r0 = lag2_r0 / edf;
    model r1 = lag1_r1 / edf;
    model r1 = lag2_r1 / edf;
    model r3 = lag1_r3 / edf;
    model r3 = lag2_r3 / edf;
    by fundid;
run;
quit;

data v_autoc_fund;
    set v_autoc_fund(where = (_TYPE_ = 'PARMS'));
run;

/* Average the slopes across funds. */
proc summary data = v_autoc_fund;
    var lag1_rr lag2_rr lag1_r0 lag2_r0 lag1_r1 lag2_r1 lag1_r3 lag2_r3;
    output out = mean_autoc_fund
           mean = rr_corr1 rr_corr2 r0_corr1 r0_corr2
                  r1_corr1 r1_corr2 r3_corr1 r3_corr2;
run;

/* Aggregate level: average every series (and its lags) across funds within
   each month, then run the same regressions on the monthly averages. With a
   single statistic and no names given, the output variables keep the names of
   the analysis variables. */
proc sort data = an00;
    by yyyymm;
run;

proc summary data = an00;
    by yyyymm;
    var rr r0 r1 r3
        lag1_rr lag2_rr lag1_r0 lag2_r0 lag1_r1 lag2_r1 lag1_r3 lag2_r3;
    output out = an00aggr mean = ;
run;

proc reg data = an00aggr tableout outest = v_autoc_aggr noprint;
    model rr = lag1_rr / edf;
    model rr = lag2_rr / edf;
    model r0 = lag1_r0 / edf;
    model r0 = lag2_r0 / edf;
    model r1 = lag1_r1 / edf;
    model r1 = lag2_r1 / edf;
    model r3 = lag1_r3 / edf;
    model r3 = lag2_r3 / edf;
run;
quit;

data v_autoc_aggr;
    set v_autoc_aggr(where = (_TYPE_ = 'PARMS'));
run;

proc summary data = v_autoc_aggr;
    var lag1_rr lag2_rr lag1_r0 lag2_r0 lag1_r1 lag2_r1 lag1_r3 lag2_r3;
    output out = mean_autoc_aggr
           mean = rr_corr1 rr_corr2 r0_corr1 r0_corr2
                  r1_corr1 r1_corr2 r3_corr1 r3_corr2;
run;

/* Alphas and betas: fund-by-fund regressions of each return series on f1,
   plus regressions that add lag1_f1 for the observed and unsmoothed series. */
proc sort data = an00;
    by fundid yyyymm;
run;

proc reg data = an00 tableout outest = v_regres_fund noprint;
    model rr = f1 / edf adjrsq;
    model r0 = f1 / edf adjrsq;
    model r1 = f1 / edf adjrsq;
    model r3 = f1 / edf adjrsq;
    model r0 = f1 lag1_f1 / edf adjrsq;
    model r1 = f1 lag1_f1 / edf adjrsq;
    model r3 = f1 lag1_f1 / edf adjrsq;
    by fundid;
run;
quit;

/* Keep the coefficient rows only. */
data v_regres_fund;
    set v_regres_fund(where = (_TYPE_ = 'PARMS'));
run;

/* Split the coefficient rows into one dataset per model. The alpha is the
   intercept times 12. For the models with lag1_f1 the beta is the sum of the
   two slopes. */
data wv_regres_fund_rr  (keep = fundid rr_a  rr_b  rr_rsq)   /* real */
     wv_regres_fund_r0  (keep = fundid r0_a  r0_b  r0_rsq)   /* observed */
     wv_regres_fund_r1  (keep = fundid r1_a  r1_b  r1_rsq)   /* 1-step unsm */
     wv_regres_fund_r3  (keep = fundid r3_a  r3_b  r3_rsq)   /* 3-step unsm */
     wv_regres_fund_r0d (keep = fundid r0d_a r0d_b r0d_rsq)  /* observed + Dimson */
     wv_regres_fund_r1d (keep = fundid r1d_a r1d_b r1d_rsq)  /* 1-step unsm + Dimson */
     wv_regres_fund_r3d (keep = fundid r3d_a r3d_b r3d_rsq); /* 3-step unsm + Dimson */
    set v_regres_fund;

    if _MODEL_ = 'MODEL1' then do;
        rr_a   = intercept*12;
        rr_b   = f1;
        rr_rsq = _ADJRSQ_;
        output wv_regres_fund_rr;
    end;
    else if _MODEL_ = 'MODEL2' then do;
        r0_a   = intercept*12;
        r0_b   = f1;
        r0_rsq = _ADJRSQ_;
        output wv_regres_fund_r0;
    end;
    else if _MODEL_ = 'MODEL3' then do;
        r1_a   = intercept*12;
        r1_b   = f1;
        r1_rsq = _ADJRSQ_;
        output wv_regres_fund_r1;
    end;
    else if _MODEL_ = 'MODEL4' then do;
        r3_a   = intercept*12;
        r3_b   = f1;
        r3_rsq = _ADJRSQ_;
        output wv_regres_fund_r3;
    end;
    else if _MODEL_ = 'MODEL5' then do;
        r0d_a   = intercept*12;
        r0d_b   = f1 + lag1_f1;
        r0d_rsq = _ADJRSQ_;
        output wv_regres_fund_r0d;
    end;
    else if _MODEL_ = 'MODEL6' then do;
        r1d_a   = intercept*12;
        r1d_b   = f1 + lag1_f1;
        r1d_rsq = _ADJRSQ_;
        output wv_regres_fund_r1d;
    end;
    else if _MODEL_ = 'MODEL7' then do;
        r3d_a   = intercept*12;
        r3d_b   = f1 + lag1_f1;
        r3d_rsq = _ADJRSQ_;
        output wv_regres_fund_r3d;
    end;
run;


/* One row per fund with all alphas, betas and adjusted R-squares, plus the
   errors of each estimate relative to the rr-based estimate. */
data regres_fund_coeff;
    merge wv_regres_fund_rr wv_regres_fund_r0 wv_regres_fund_r1 wv_regres_fund_r3
          wv_regres_fund_r0d wv_regres_fund_r1d wv_regres_fund_r3d;
    by fundid;

    /* existing estimates */
    array est_a {6} r0_a r1_a r3_a r0d_a r1d_a r3d_a;
    array est_b {6} r0_b r1_b r3_b r0d_b r1d_b r3d_b;
    /* new variables: absolute alpha error, squared alpha error, absolute beta error */
    array abs_a {6} r0_a_ae  r1_a_ae  r3_a_ae  r0d_a_ae  r1d_a_ae  r3d_a_ae;
    array sq_a  {6} r0_a_sqe r1_a_sqe r3_a_sqe r0d_a_sqe r1d_a_sqe r3d_a_sqe;
    array abs_b {6} r0_b_ae  r1_b_ae  r3_b_ae  r0d_b_ae  r1d_b_ae  r3d_b_ae;

    do _k = 1 to 6;
        abs_a{_k} = abs(rr_a - est_a{_k});
        sq_a{_k}  = (rr_a - est_a{_k})**2;
        abs_b{_k} = abs(rr_b - est_b{_k});
    end;
    drop _k;
run;

/* Average across funds. The error columns are stored as mae (mean absolute
   error) and msqe (mean squared error). */
proc summary data = regres_fund_coeff;
    var rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
        r0_a_ae r1_a_ae r3_a_ae r0d_a_ae r1d_a_ae r3d_a_ae
        r0_a_sqe r1_a_sqe r3_a_sqe r0d_a_sqe r1d_a_sqe r3d_a_sqe
        rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
        r0_b_ae r1_b_ae r3_b_ae r0d_b_ae r1d_b_ae r3d_b_ae
        rr_rsq r0_rsq r1_rsq r3_rsq r0d_rsq r1d_rsq r3d_rsq;
    output out = mean_reg_stats
           mean = rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
                  r0_a_mae r1_a_mae r3_a_mae r0d_a_mae r1d_a_mae r3d_a_mae
                  r0_a_msqe r1_a_msqe r3_a_msqe r0d_a_msqe r1d_a_msqe r3d_a_msqe
                  rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
                  r0_b_mae r1_b_mae r3_b_mae r0d_b_mae r1d_b_mae r3d_b_mae
                  rr_rsq r0_rsq r1_rsq r3_rsq r0d_rsq r1d_rsq r3d_rsq;
run;

/* Collect everything for this simulation run into a single row. */

/* Start with the run index and the simulation parameters. */
data tab_sim;
    set avs_ret_funds(keep = _FREQ_);
    loop_i      = &loopn;
    nfunds      = &n_funds;
    tperiods    = &n_periods;
    pai0        = &pai0;
    phi0        = &phi0;
    f1_mean     = &f1_mean;
    f1_std      = &f1_std;
    alpha1_mean = &alpha1_mean;
    alpha1_std  = &alpha1_std;
    beta1_mean  = &beta1_mean;
    beta1_std   = &beta1_std;
    err_std     = &err_std;
    drop _FREQ_;
run;

/* Estimated smoothing coefficients and the volatility-ratio check. */
data tab_sim;
    merge tab_sim save_smooth_coeff check_std_unsm;
run;

/* Fund-level autocorrelations, stored with the avf_ prefix. */
data tab_sim;
    merge tab_sim mean_autoc_fund;
    avf_rr_corr1 = rr_corr1;
    avf_rr_corr2 = rr_corr2;
    avf_r0_corr1 = r0_corr1;
    avf_r0_corr2 = r0_corr2;
    avf_r1_corr1 = r1_corr1;
    avf_r1_corr2 = r1_corr2;
    avf_r3_corr1 = r3_corr1;
    avf_r3_corr2 = r3_corr2;
    drop _TYPE_ _FREQ_
         rr_corr1 rr_corr2 r0_corr1 r0_corr2
         r1_corr1 r1_corr2 r3_corr1 r3_corr2;
run;

/* Aggregate-level autocorrelations, stored with the agg_ prefix. */
data tab_sim;
    merge tab_sim mean_autoc_aggr;
    agg_rr_corr1 = rr_corr1;
    agg_rr_corr2 = rr_corr2;
    agg_r0_corr1 = r0_corr1;
    agg_r0_corr2 = r0_corr2;
    agg_r1_corr1 = r1_corr1;
    agg_r1_corr2 = r1_corr2;
    agg_r3_corr1 = r3_corr1;
    agg_r3_corr2 = r3_corr2;
    drop _TYPE_ _FREQ_
         rr_corr1 rr_corr2 r0_corr1 r0_corr2
         r1_corr1 r1_corr2 r3_corr1 r3_corr2;
run;

/* Average fund standard deviations and their absolute errors. */
data tab_sim;
    merge tab_sim
          avs_ret_funds(keep = av_rr_std av_r0_std av_r1_std av_r3_std
                               av_abserr_r0_std av_abserr_r1_std av_abserr_r3_std);
run;

/* Alphas, betas and R-squares, plus indicators that are 1 when the 3-step
   alpha error exceeds the error of the observed-plus-lag regression. */
data wtab_sim_&loopn;
    merge tab_sim
          mean_reg_stats(keep = rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
                                r0_a_mae r1_a_mae r3_a_mae r0d_a_mae r1d_a_mae r3d_a_mae
                                r0_a_msqe r1_a_msqe r3_a_msqe r0d_a_msqe r1d_a_msqe r3d_a_msqe
                                rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
                                r0_b_mae r1_b_mae r3_b_mae r0d_b_mae r1d_b_mae r3d_b_mae
                                rr_rsq r0_rsq r1_rsq r3_rsq r0d_rsq r1d_rsq r3d_rsq);
    a_mae_r3_gt_r0d_i = 0;
    if r3_a_mae gt r0d_a_mae then a_mae_r3_gt_r0d_i = 1;
    a_msqe_r3_gt_r0d_i = 0;
    if r3_a_msqe gt r0d_a_msqe then a_msqe_r3_gt_r0d_i = 1;
run;

%end;


/* Stack the per-run rows of all simulation runs. */
data tab_sim_all;
    set
    %do loopn = 1 %to &n_loops;
        wtab_sim_&loopn
    %end;
    ;
run;

/* Average every statistic across runs. With a single statistic and no names
   given, the output variables keep the names of the analysis variables. */
proc summary data = tab_sim_all;
    var nfunds tperiods pai0 phi0 f1_mean f1_std
        alpha1_mean alpha1_std beta1_mean beta1_std err_std
        ma1_theta_0 ma1_theta_1 ma1_pai_0 ma1_pai_1 ma1_phi_0 ma1_phi_1 ma1_psi_l0 ma1_psi_l1
        r1_r0_std_ratio_min r1_r0_std_ratio_max r3_r0_std_ratio_min r3_r0_std_ratio_max
        avf_rr_corr1 avf_rr_corr2 avf_r0_corr1 avf_r0_corr2
        avf_r1_corr1 avf_r1_corr2 avf_r3_corr1 avf_r3_corr2
        agg_rr_corr1 agg_rr_corr2 agg_r0_corr1 agg_r0_corr2
        agg_r1_corr1 agg_r1_corr2 agg_r3_corr1 agg_r3_corr2
        av_rr_std av_r0_std av_r1_std av_r3_std
        av_abserr_r0_std av_abserr_r1_std av_abserr_r3_std
        rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
        r0_a_mae r1_a_mae r3_a_mae r0d_a_mae r1d_a_mae r3d_a_mae a_mae_r3_gt_r0d_i
        r0_a_msqe r1_a_msqe r3_a_msqe r0d_a_msqe r1d_a_msqe r3d_a_msqe a_msqe_r3_gt_r0d_i
        rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
        r0_b_mae r1_b_mae r3_b_mae r0d_b_mae r1d_b_mae r3d_b_mae
        rr_rsq r0_rsq r1_rsq r3_rsq r0d_rsq r1d_rsq r3d_rsq;
    output out = av_tab_sim_all mean = ;
run;

/* Tag the averages with the number of runs and the version. */
data av_tab_sim_all_v&version;
    set av_tab_sim_all;
    simulation_runs = _FREQ_;
    version         = &version;
    drop _TYPE_ _FREQ_;
run;

/* Minimum and maximum of every statistic across runs. The two indicator
   variables are not part of this list. */
proc summary data = tab_sim_all;
    var nfunds tperiods pai0 phi0 f1_mean f1_std
        alpha1_mean alpha1_std beta1_mean beta1_std err_std
        ma1_theta_0 ma1_theta_1 ma1_pai_0 ma1_pai_1 ma1_phi_0 ma1_phi_1 ma1_psi_l0 ma1_psi_l1
        r1_r0_std_ratio_min r1_r0_std_ratio_max r3_r0_std_ratio_min r3_r0_std_ratio_max
        avf_rr_corr1 avf_rr_corr2 avf_r0_corr1 avf_r0_corr2
        avf_r1_corr1 avf_r1_corr2 avf_r3_corr1 avf_r3_corr2
        agg_rr_corr1 agg_rr_corr2 agg_r0_corr1 agg_r0_corr2
        agg_r1_corr1 agg_r1_corr2 agg_r3_corr1 agg_r3_corr2
        av_rr_std av_r0_std av_r1_std av_r3_std
        av_abserr_r0_std av_abserr_r1_std av_abserr_r3_std
        rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
        r0_a_mae r1_a_mae r3_a_mae r0d_a_mae r1d_a_mae r3d_a_mae
        r0_a_msqe r1_a_msqe r3_a_msqe r0d_a_msqe r1d_a_msqe r3d_a_msqe
        rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
        r0_b_mae r1_b_mae r3_b_mae r0d_b_mae r1d_b_mae r3d_b_mae
        rr_rsq r0_rsq r1_rsq r3_rsq r0d_rsq r1d_rsq r3d_rsq;
    output out = stats_av_tab_sim_all
           min = max = / autoname;
run;

data stats_av_tab_sim_all_v&version;
    set stats_av_tab_sim_all;
    simulation_runs = _FREQ_;
    version         = &version;
    drop _TYPE_ _FREQ_;
run;

%end;

%mend bigloop;
%bigloop;




/* Stack the results of versions 1 to 3. */
data all_av_tab_sim_all;
    set av_tab_sim_all_v1
        av_tab_sim_all_v2
        av_tab_sim_all_v3;
run;

data stats_all_av_tab_sim_all;
    set stats_av_tab_sim_all_v1
        stats_av_tab_sim_all_v2
        stats_av_tab_sim_all_v3;
run;

/* Keep the reported columns and express alphas and betas relative to the
   rr-based estimates. */
/* NOTE(review): the +1 in the beta correction presumably stands for the
   mean beta of the simulation (beta1_mean) - confirm against the macro
   parameters. */
data all_av_tab_sim_all_clean;
    set all_av_tab_sim_all(keep =
        version simulation_runs nfunds tperiods
        f1_mean f1_std alpha1_mean alpha1_std beta1_mean beta1_std err_std
        pai0 phi0 ma1_theta_0 ma1_theta_1 ma1_pai_0 ma1_pai_1
        ma1_phi_0 ma1_phi_1 ma1_psi_l0 ma1_psi_l1
        avf_rr_corr1 avf_rr_corr2 avf_r0_corr1 avf_r0_corr2
        avf_r1_corr1 avf_r1_corr2 avf_r3_corr1 avf_r3_corr2
        agg_rr_corr1 agg_rr_corr2 agg_r0_corr1 agg_r0_corr2
        agg_r1_corr1 agg_r1_corr2 agg_r3_corr1 agg_r3_corr2
        av_rr_std av_r0_std av_r1_std av_r3_std
        av_abserr_r0_std av_abserr_r1_std av_abserr_r3_std
        rr_a r0_a r1_a r3_a r0d_a r1d_a r3d_a
        r0_a_mae r1_a_mae r3_a_mae r0d_a_mae r1d_a_mae r3d_a_mae a_mae_r3_gt_r0d_i
        r0_a_msqe r1_a_msqe r3_a_msqe r0d_a_msqe r1d_a_msqe r3d_a_msqe a_msqe_r3_gt_r0d_i
        rr_b r0_b r1_b r3_b r0d_b r1d_b r3d_b
        r0_b_mae r1_b_mae r3_b_mae r0d_b_mae r1d_b_mae r3d_b_mae);

    /* alphas: difference from the rr-based alpha */
    rr_alpha  = rr_a;
    r0_alpha  = r0_a - rr_a;
    r1_alpha  = r1_a - rr_a;
    r3_alpha  = r3_a - rr_a;
    r0d_alpha = r0d_a - rr_a;
    r1d_alpha = r1d_a - rr_a;
    r3d_alpha = r3d_a - rr_a;

    /* betas: difference from the rr-based beta, re-centred at 1 */
    rr_beta  = rr_b;
    r0_beta  = r0_b - rr_b +1;
    r1_beta  = r1_b - rr_b +1;
    r3_beta  = r3_b - rr_b +1;
    r0d_beta = r0d_b - rr_b +1;
    r1d_beta = r1d_b - rr_b +1;
    r3d_beta = r3d_b - rr_b +1;
run;


/* Final table: rename the columns to their reporting names, scale the mean
   squared alpha errors by 100, and keep only the reported columns. */
data tab_all_clean;
    set all_av_tab_sim_all_clean;

    /* simulation settings */
    sim_runs = simulation_runs;
    specific = version;
    phi1     = 1 - phi0;
    pai1     = 1 - pai0;
    std_beta = beta1_std;
    std_eps  = err_std;
    std_f    = f1_std;

    /* first-order autocorrelations, fund level and aggregate level */
    r0_fund_corr1 = avf_r0_corr1;
    r1_fund_corr1 = avf_r1_corr1;
    r3_fund_corr1 = avf_r3_corr1;
    r0_aggr_corr1 = agg_r0_corr1;
    r1_aggr_corr1 = agg_r1_corr1;
    r3_aggr_corr1 = agg_r3_corr1;

    /* alpha: mean absolute error */
    r0_alpha_mae  = r0_a_mae;
    r1_alpha_mae  = r1_a_mae;
    r3_alpha_mae  = r3_a_mae;
    r0d_alpha_mae = r0d_a_mae;
    r1d_alpha_mae = r1d_a_mae;
    r3d_alpha_mae = r3d_a_mae;
    alpha_mae_r3_gt_r0d_i = a_mae_r3_gt_r0d_i;

    /* alpha: mean squared error times 100 */
    r0_alpha_msqe100  = r0_a_msqe*100;
    r1_alpha_msqe100  = r1_a_msqe*100;
    r3_alpha_msqe100  = r3_a_msqe*100;
    r0d_alpha_msqe100 = r0d_a_msqe*100;
    r1d_alpha_msqe100 = r1d_a_msqe*100;
    r3d_alpha_msqe100 = r3d_a_msqe*100;
    alpha_msqe_r3_gt_r0d_i = a_msqe_r3_gt_r0d_i;

    /* beta: mean absolute error */
    r0_beta_mae  = r0_b_mae;
    r1_beta_mae  = r1_b_mae;
    r3_beta_mae  = r3_b_mae;
    r0d_beta_mae = r0d_b_mae;
    r1d_beta_mae = r1d_b_mae;
    r3d_beta_mae = r3d_b_mae;

    keep specific sim_runs nfunds tperiods
         phi1 pai1 std_beta std_eps std_f
         r0_fund_corr1 r1_fund_corr1 r3_fund_corr1
         r0_aggr_corr1 r1_aggr_corr1 r3_aggr_corr1
         rr_alpha r0_alpha r1_alpha r3_alpha r0d_alpha r1d_alpha r3d_alpha
         r0_alpha_mae r1_alpha_mae r3_alpha_mae
         r0d_alpha_mae r1d_alpha_mae r3d_alpha_mae alpha_mae_r3_gt_r0d_i
         r0_alpha_msqe100 r1_alpha_msqe100 r3_alpha_msqe100
         r0d_alpha_msqe100 r1d_alpha_msqe100 r3d_alpha_msqe100 alpha_msqe_r3_gt_r0d_i
         rr_beta r0_beta r1_beta r3_beta r0d_beta r1d_beta r3d_beta
         r0_beta_mae r1_beta_mae r3_beta_mae r0d_beta_mae r1d_beta_mae r3d_beta_mae;
run;








*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
** CODE 6: SIMULATION: MA(3) WITH 8-FACTOR MODEL;
****Produces: Table 5, Panel B;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;
*********************************************************************************************************;

***********************************************************************************************************;
***********************************************************************************************************;
*** Simulation Panel B, DGP follows MA(3) with 8-factor models;
*** The Simulated Funds (i.e., N, T, empirical betas, etc) are Modeled After Relative Value Funds;


***********************************************************************************************************;
***********************************************************************************************************;
******** STEP 1 of Simulation Panel B: Generate Fund Returns;
** (step 2 is to compute the alphas of the generated fund returns using different methodologies);

*Start from 3-step unsmoothing of fund returns;
*The first part of this simulation code is just the standard 3-step unsmoothing procedure;

/* Load the fund-month panel and keep the columns used below. */
data hf00(keep = fundid_mer ret yyyymm assets_fill fund_type stra bh_fund);
    set hfautoc.hf_merge00_july2020_min5;
run;

/* Lagged assets within fund: the lag is blanked on the first row of each fund. */
proc sort data = hf00;
    by fundid_mer yyyymm;
run;

data hf00;
    set hf00;
    by fundid_mer;
    lag_assets_fill = lag(assets_fill);
    if first.fundid_mer then lag_assets_fill = .;
run;

/* Attach the manual fund classification (keyed by fund_type and stra). */
proc import out = work.Strat_manual_1
    datafile = "C:\Users\&pcname.\Dropbox\Research\Hedge Funds\Unsmoothing Returns\Hedge Fund Analysis\Summer_Revision\Strat_manual_1.xlsx"
    dbms = xlsx replace;
    getnames = yes;
run;

proc sort data = hf00;
    by fund_type stra;
run;

proc sort data = strat_manual_1;
    by fund_type stra;
run;

data hf00;
    merge hf00 strat_manual_1;
    by fund_type stra;
    categ = jkt_category;
    drop jkt_category;
run;

/* Drop the Other category and assign a rank to each remaining category.
   Categories not listed below get rank 1. */
data hf00;
    set hf00;
    if categ = 'Other' then delete;
    if      categ = 'Event_driven'     then aut_rank = 2;
    else if categ = 'Multi_strategy'   then aut_rank = 3;
    else if categ = 'Emerging_Markets' then aut_rank = 4;
    else if categ = 'Sector'           then aut_rank = 5;
    else if categ = 'Long_Only'        then aut_rank = 6;
    else if categ = 'Long_Short'       then aut_rank = 7;
    else if categ = 'Market_Neutral'   then aut_rank = 8;
    else if categ = 'Global Macro'     then aut_rank = 9;
    else if categ = 'CTA'              then aut_rank = 10;
    else if categ = 'FOF'              then aut_rank = 99;
    else                                    aut_rank = 1;
run;

/* Model the simulated funds after the Relative_Value category,
   months 199501 through 201712. */
data hf00;
    set hf00;
    if categ = 'Relative_Value' and 199501 le yyyymm le 201712;
run;


/* Build the imputed add dates from both sources. The merged fund id is the
   source id times 100 plus 1 for the tass file and plus 2 for the bh file. */
data tass_imputed_add_date1(keep = fundid_mer imputed_add_date);
    set hfautoc.tass_imputed_add_date;
    fundid_mer = fundid*100 + 1;
run;

data bh_imputed_add_date1(keep = fundid_mer imputed_add_date);
    set hfautoc.bh_imputed_add_date;
    fundid_mer = fund_id*100 + 2;
run;

data imputed_add_dates;
    set tass_imputed_add_date1
        bh_imputed_add_date1;
run;

proc sort data = hf00;
    by fundid_mer;
run;

proc sort data = imputed_add_dates;
    by fundid_mer;
run;

/* Attach the add date, keeping only funds already present in hf00. */
data hf00;
    merge hf00(in = a) imputed_add_dates;
    by fundid_mer;
    if a;
run;

/* Mean and standard deviation of returns by fund. */
proc sort data = hf00;
    by fundid_mer;
run;

proc summary data = hf00;
    by fundid_mer;
    var ret;
    output out = w_mean_ret_1 mean = av_ret_fundid std = std_ret_fundid;
run;

/* Keep funds with at least 36 rows (the _FREQ_ count of the summary). */
data hf00;
    merge hf00 w_mean_ret_1;
    by fundid_mer;
    fundid_mer_obs = _FREQ_;
    if fundid_mer_obs ge 36;
    drop _FREQ_ _TYPE_;
run;

proc sort data = hf00;
    by fundid_mer yyyymm;
run;

/* Month counter within fund, starting at 1. */
data hf00;
    set hf00;
    by fundid_mer;
    fund_seq + 1;
    if first.fundid_mer then fund_seq = 1;
run;

/* Number of funds (and average return) by category in each month. */
proc sort data = hf00;
    by yyyymm categ;
run;

proc summary data = hf00;
    by yyyymm categ;
    var ret;
    output out = w_categ_mm_n mean = av_ret_categ;
run;

/* Number of distinct funds by category. */
data w_count_categ_funds(keep = fundid_mer ret categ);
    set hf00;
run;

proc sort data = w_count_categ_funds nodupkey;
    by categ fundid_mer;
run;

proc summary data = w_count_categ_funds;
    by categ;
    var ret;
    output out = w_categ_n mean = av_ret_categ;
run;

/* Equal-weighted category return in each month. */
proc sort data = hf00;
    by aut_rank categ yyyymm;
run;

proc summary data = hf00;
    by aut_rank categ yyyymm;
    var ret;
    output out = hf00_ewret mean = ret_ew;
run;

proc sort data = hf00_ewret;
    by categ yyyymm;
run;

/****************************************************************************************************************/
/*************************************** 3-Step Unsmoothing *******************************/
**** FIRST STEP: GET AGGREGATE ECONOMIC (UNSMOOTHED) RETURNS;

*Aggregate panel - keep the EW return and the number of fund rows behind it;
data s1ag00 (keep=aut_rank categ yyyymm ret_ew funds_categ_mm);
    set hf00_ewret;
    funds_categ_mm = _FREQ_;
run;

*Time-series averages by category of the EW return and of the fund count;
proc sort data=s1ag00;
    by categ;
run;

proc means data=s1ag00 noprint;
    by categ;
    var ret_ew funds_categ_mm;
    output out=w_mean_ew_ret
           mean=av_aggrret_categ av_funds_categ_mm;
run;

*De-mean the category return - the fund-count columns are not needed afterwards;
data s1ag00 (drop=_TYPE_ _FREQ_ funds_categ_mm av_funds_categ_mm);
    merge s1ag00 w_mean_ew_ret;
    by categ;
    dem_catret_ew = ret_ew - av_aggrret_categ;
run;

*Running month counter within category in date order - starts at 1;
proc sort data=s1ag00;
    by categ yyyymm;
run;

data s1ag00;
    set s1ag00;
    by categ;
    if first.categ then categ_seq = 0;
    categ_seq + 1;
run;

*Estimate the smoothing process of the aggregate category return;
*An MA(3) without intercept is fitted by maximum likelihood to the de-meaned EW category return, one fit per category;
*The three MA starting values are set to -0.2;
proc sort data = s1ag00; by categ yyyymm; run;
proc arima data= s1ag00;
identify var = dem_catret_ew noprint;
estimate q= 3 noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= ag_arima_ma3_est OUTSTAT=ag_arima_ma3_diag noprint;
*forecast noprint;
by categ;
run;
quit;

*Convert the estimated MA coefficients into smoothing weights (pais) for the MA(3) case;
*The weights are 1, -ma1_1, -ma1_2, -ma1_3 divided by their sum so that the four weights add up to one;
*Only the row with _TYPE_ equal to EST (the point estimates) is retained;
data ag_arima_ma3_est;
set ag_arima_ma3_est;
if _TYPE_ = 'EST';
*keep the convergence message so that non-converged fits can be screened out later;
ma3_STATUS_ = _STATUS_;
ma3_pai_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_pai_0 = 1/ma3_pai_sum;
ma3_pai_1 = -ma1_1/ma3_pai_sum;
ma3_pai_2 = -ma1_2/ma3_pai_sum;
ma3_pai_3 = -ma1_3/ma3_pai_sum;
*sanity check - the four normalised weights should sum to one;
*the lag-3 weight was previously left out of this sum - the variable is not kept so no downstream value changes;
ma3_pai_sum_norm = ma3_pai_0 + ma3_pai_1 + ma3_pai_2 + ma3_pai_3;
keep categ ma3_STATUS_ ma3_pai_0 ma3_pai_1 ma3_pai_2 ma3_pai_3 ma3_pai_sum;
run;

*Attach the category-level smoothing weights to every category-month row and pick the weights to use;
proc sort data=ag_arima_ma3_est;
    by categ;
run;
proc sort data=s1ag00;
    by categ;
run;

data s1ag01;
    merge s1ag00 ag_arima_ma3_est;
    by categ;

    *Flag implausible estimates - any weight above 1.25 or at or below -0.45;
    *A missing weight also trips the lower test because a missing value compares below every number;
    ma3_gt_1_5 = 0;
    if ma3_pai_0 > 1.25 or ma3_pai_1 > 1.25 or ma3_pai_2 > 1.25 or ma3_pai_3 > 1.25
        then ma3_gt_1_5 = 1;
    if ma3_pai_0 <= -0.45 or ma3_pai_1 <= -0.45 or ma3_pai_2 <= -0.45 or ma3_pai_3 <= -0.45
        then ma3_gt_1_5 = 1;

    *1 when the ARIMA fit did not report convergence;
    ma3_status = (ma3_STATUS_ ne "0 Converged");

    *Default to the estimated weights;
    sel_pai_0 = ma3_pai_0;
    sel_pai_1 = ma3_pai_1;
    sel_pai_2 = ma3_pai_2;
    sel_pai_3 = ma3_pai_3;

    *Fall back to no smoothing (all weight on lag 0) for flagged or non-converged fits;
    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_pai_0 = 1;
        sel_pai_1 = 0;
        sel_pai_2 = 0;
        sel_pai_3 = 0;
    end;
run;

*Selected aggregate weights - one row per aut_rank and category;
proc sort data=s1ag01 (keep=aut_rank categ sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3)
          out=s1_ag_pais
          nodupkey;
    by aut_rank categ;
run;


*back out estimated returns for aggregate returns;
proc sort data = s1ag01; by categ yyyymm; run;

%macro back_out_catrets;

/*
   Purpose : invert the selected MA(3) smoothing weights to recover the unsmoothed
             (de-meaned) aggregate category return.
   Input   : s1ag01      - category-month rows with dem_catret_ew, categ_seq and sel_pai_0..3.
   Output  : s1ag01_loop - same rows plus backed_catret_aic and its three lags.

   Recursion, for categ_seq >= 4:
     backed(t) = ( dem(t) - pai1*backed(t-1) - pai2*backed(t-2) - pai3*backed(t-3) ) / pai0
   The first three months of each category keep the smoothed value as the initial condition.

   Implementation note: the recursion is done in ONE pass with retained lags. The earlier
   version rewrote the whole data set once per sequence number (4 to 276), which is far
   slower and silently left months beyond sequence 276 un-adjusted. For series of up to
   276 months the backed-out values are the same.
   Assumes categ_seq counts consecutive rows from 1 within categ, as built upstream.
*/

data s1ag01_loop;
set s1ag01;
backed_catret_aic = dem_catret_ew;
keep aut_rank categ categ_seq backed_catret_aic dem_catret_ew yyyymm av_aggrret_categ
sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3;
run;

proc sort data = s1ag01_loop; by categ yyyymm; run;

data s1ag01_loop;
set s1ag01_loop;
by categ;

/* prev1-prev3 carry the already backed-out values of the three previous rows.
   LAG() cannot be used here because it would return the values as read from
   disk, not the values updated earlier in this same pass. */
retain prev1_backed prev2_backed prev3_backed;
if first.categ then call missing(prev1_backed, prev2_backed, prev3_backed);

lag1_backed_catret_aic = prev1_backed;
lag2_backed_catret_aic = prev2_backed;
lag3_backed_catret_aic = prev3_backed;

if categ_seq ge 4 then backed_catret_aic = (dem_catret_ew - sel_pai_1*lag1_backed_catret_aic - sel_pai_2*lag2_backed_catret_aic - sel_pai_3*lag3_backed_catret_aic)/sel_pai_0;

/* shift the window after the current row has been finalised */
prev3_backed = prev2_backed;
prev2_backed = prev1_backed;
prev1_backed = backed_catret_aic;

drop prev1_backed prev2_backed prev3_backed;
run;

%mend back_out_catrets;
%back_out_catrets;

*Category mean of the backed-out series;
proc sort data=s1ag01_loop;
    by categ;
run;

proc means data=s1ag01_loop noprint;
    by categ;
    var backed_catret_aic;
    output out=check_av_s1ag01_loop mean= / autoname;
run;

*Re-centre so that the backed-out series averages zero within each category;
data s1ag01_loop (drop=backed_catret_aic_mean _FREQ_ _TYPE_);
    merge s1ag01_loop check_av_s1ag01_loop;
    by categ;
    backed_catret_aic = backed_catret_aic - backed_catret_aic_mean;
run;

*Verification table - the category means should now be zero;
proc sort data=s1ag01_loop;
    by categ;
run;

proc means data=s1ag01_loop noprint;
    by categ;
    var backed_catret_aic;
    output out=check_av_s1ag01_loop_v2 mean= / autoname;
run;

*Keep the de-meaned series under its own name and add the category average back to get the level;
data s1ag01_loop (keep=categ yyyymm backed_catret_aic dem_backed_catret_aic);
    set s1ag01_loop;
    dem_backed_catret_aic = backed_catret_aic;
    backed_catret_aic     = backed_catret_aic + av_aggrret_categ;
run;

*Attach the unsmoothed aggregate return to the step-1 panel;
proc sort data=s1ag01_loop;
    by categ yyyymm;
run;
proc sort data=s1ag01;
    by categ yyyymm;
run;

data s1ag02;
    merge s1ag01 s1ag01_loop;
    by categ yyyymm;
run;

/*********************************************************************/
*Continue to step 2;
**** SECOND STEP: GET FUND-LEVEL UNSMOOTHED EXCESS RETURNS (EXCESS WRT EW CATEGORY);

*Fund-level panel restricted to the fields used in step 2;
data s2fund00;
    set hf00;
    keep ret yyyymm fundid_mer categ fund_seq av_ret_fundid lag_assets_fill aut_rank;
run;

*Category-month series - the smoothed EW return and the unsmoothed return from step 1;
*NOTE(review) - ag_aic_adj1_win is not created anywhere in the visible code so the self-assignment presumably yields an all-missing column - confirm before removing it;
data w_au_ewcatert;
    set s1ag02;
    catret_ew       = ret_ew;
    ag_aic_adj1_win = ag_aic_adj1_win;
    keep categ yyyymm catret_ew backed_catret_aic ag_aic_adj1_win;
run;

*Lags 1 to 3 of the unsmoothed category return;
*A lag is blanked whenever it would reach back into a different category;
*Every LAG call below runs on every row, which keeps each lag queue in step with the data;
proc sort data=w_au_ewcatert;
    by categ yyyymm;
run;

data w_au_ewcatert;
    set w_au_ewcatert;

    lag1_backed_catret_aic = lag(backed_catret_aic);
    if lag(categ) ne categ then call missing(lag1_backed_catret_aic);

    lag2_backed_catret_aic = lag2(backed_catret_aic);
    if lag2(categ) ne categ then call missing(lag2_backed_catret_aic);

    lag3_backed_catret_aic = lag3(backed_catret_aic);
    if lag3(categ) ne categ then call missing(lag3_backed_catret_aic);
run;

*Attach the category series to each fund-month and form the return in excess of the category;
proc sort data=s2fund00;
    by categ yyyymm;
run;
proc sort data=w_au_ewcatert;
    by categ yyyymm;
run;

data s2fund00;
    merge s2fund00 w_au_ewcatert;
    by categ yyyymm;
    ret_excat = ret - catret_ew;
run;

*Fund-level sample means of the excess return and of the unsmoothed category return with its three lags;
proc sort data=s2fund00;
    by fundid_mer;
run;

proc means data=s2fund00 noprint;
    by fundid_mer;
    var ret_excat
        backed_catret_aic
        lag1_backed_catret_aic
        lag2_backed_catret_aic
        lag3_backed_catret_aic;
    output out=w_mean_excat_ret
           mean=av_retexcat_fundid
                av_backed_catret_aic_fund
                av_lag1_backed_catret_aic_fund
                av_lag2_backed_catret_aic_fund
                av_lag3_backed_catret_aic_fund;
run;

*De-mean every series with the mean taken over the same fund sample;
data s2fund00;
    merge s2fund00 w_mean_excat_ret;
    by fundid_mer;
    dem_ret_excat              = ret_excat              - av_retexcat_fundid;
    dem_backed_catret_aic      = backed_catret_aic      - av_backed_catret_aic_fund;
    dem_lag1_backed_catret_aic = lag1_backed_catret_aic - av_lag1_backed_catret_aic_fund;
    dem_lag2_backed_catret_aic = lag2_backed_catret_aic - av_lag2_backed_catret_aic_fund;
    dem_lag3_backed_catret_aic = lag3_backed_catret_aic - av_lag3_backed_catret_aic_fund;
run;

*Verification table - the de-meaned series should average zero for each fund;
proc means data=s2fund00 noprint;
    by fundid_mer;
    var dem_ret_excat dem_backed_catret_aic dem_lag1_backed_catret_aic;
    output out=check_avg_dem_ret_excat mean= / autoname;
run;


*Estimate the fund-specific smoothing process on returns in excess of the category return;
*An MA(3) without intercept is fitted by maximum likelihood, one fit per fund;
*The de-meaned unsmoothed category return and its three lags enter as regressors;
proc sort data = s2fund00; by fundid_mer yyyymm; run;
proc arima data= s2fund00;
identify var = dem_ret_excat crosscorr = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noprint;
estimate q= 3 input = (dem_backed_catret_aic dem_lag1_backed_catret_aic dem_lag2_backed_catret_aic dem_lag3_backed_catret_aic) noint ma = -0.2 -0.2 -0.2 method=ml OUTEST= excat_arima_ma3_est OUTSTAT= excat_arima_ma3_diag noprint;
by fundid_mer;
run;
quit;

*Convert the estimated MA coefficients into fund-level smoothing weights (phis) for the MA(3) case;
*The weights are 1, -ma1_1, -ma1_2, -ma1_3 divided by their sum so that the four weights add up to one;
*Only the row with _TYPE_ equal to EST (the point estimates) is retained;
data excat_arima_ma3_est;
set excat_arima_ma3_est;
if _TYPE_ = 'EST';
*keep the convergence message so that non-converged fits can be screened out later;
ma3_STATUS_ = _STATUS_;
ma3_phi_sum = 1 - ma1_1 - ma1_2 - ma1_3;
ma3_phi_0 = 1/ma3_phi_sum;
ma3_phi_1 = -ma1_1/ma3_phi_sum;
ma3_phi_2 = -ma1_2/ma3_phi_sum;
ma3_phi_3 = -ma1_3/ma3_phi_sum;
*sanity check - the four normalised weights should sum to one;
*the lag-3 weight was previously left out of this sum - the variable is not kept so no downstream value changes;
ma3_phi_sum_norm = ma3_phi_0 + ma3_phi_1 + ma3_phi_2 + ma3_phi_3;
keep fundid_mer ma3_STATUS_ ma3_phi_0 ma3_phi_1 ma3_phi_2 ma3_phi_3 ma3_phi_sum;
run;

*Attach the fund-level smoothing weights to every fund-month row and pick the weights to use;
proc sort data=s2fund00;
    by fundid_mer;
run;
proc sort data=excat_arima_ma3_est;
    by fundid_mer;
run;

data s2fund01;
    merge s2fund00 excat_arima_ma3_est;
    by fundid_mer;

    *Flag implausible estimates - any weight above 1.25 or at or below -0.45;
    *A missing weight also trips the lower test because a missing value compares below every number;
    ma3_gt_1_5 = 0;
    if ma3_phi_0 > 1.25 or ma3_phi_1 > 1.25 or ma3_phi_2 > 1.25 or ma3_phi_3 > 1.25
        then ma3_gt_1_5 = 1;
    if ma3_phi_0 <= -0.45 or ma3_phi_1 <= -0.45 or ma3_phi_2 <= -0.45 or ma3_phi_3 <= -0.45
        then ma3_gt_1_5 = 1;

    *1 when the ARIMA fit did not report convergence;
    ma3_status = (ma3_STATUS_ ne "0 Converged");

    *Default to the estimated weights;
    sel_phi_0 = ma3_phi_0;
    sel_phi_1 = ma3_phi_1;
    sel_phi_2 = ma3_phi_2;
    sel_phi_3 = ma3_phi_3;

    *Fall back to no smoothing (all weight on lag 0) for flagged or non-converged fits;
    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_phi_0 = 1;
        sel_phi_1 = 0;
        sel_phi_2 = 0;
        sel_phi_3 = 0;
    end;
run;


*Selected fund-level weights - one row per fund;
proc sort data=s2fund01 (keep=aut_rank categ fundid_mer sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3)
          out=w_s2_res_phis
          nodupkey;
    by aut_rank categ fundid_mer;
run;

*Cross-fund average of each weight within aut_rank and category;
proc means data=w_s2_res_phis noprint;
    by aut_rank categ;
    var sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
    output out=s2_res_phis
           mean=sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
run;

*Outlier check - minimum and maximum of each weight;
proc means data=w_s2_res_phis noprint;
    by aut_rank categ;
    var sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
    output out=s2_res_phis_minmax
           min= max= / autoname;
run;

*Outlier check - 1st and 99th percentiles of each weight;
proc means data=w_s2_res_phis noprint;
    by aut_rank categ;
    var sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
    output out=s2_res_phis_p1p99
           p1= p99= / autoname;
run;

*Table with the aggregate weights (pais) next to the average fund-level weights (phis);
data tab_ma_coeff_all (drop=_TYPE_ _FREQ_);
    merge s1_ag_pais s2_res_phis;
    by aut_rank categ;
run;


*back out estimated excess returns;
proc sort data = s2fund01; by fundid_mer yyyymm; run;

%macro back_out_exrets;

/*
   Purpose : invert the selected fund-level MA(3) weights to recover the unsmoothed
             (de-meaned) fund return in excess of the category return.
   Input   : s2fund01      - fund-month rows with dem_ret_excat, fund_seq and sel_phi_0..3.
   Output  : s2fund01_loop - same rows plus backed_ret_excat_aic and its three lags.

   Recursion, for fund_seq >= 4:
     backed(t) = ( dem(t) - phi1*backed(t-1) - phi2*backed(t-2) - phi3*backed(t-3) ) / phi0
   The first three observations of each fund keep the smoothed value as the initial condition.

   Implementation note: the recursion is done in ONE pass with retained lags. The earlier
   version rewrote the whole fund panel once per sequence number (4 to 276), which is far
   slower and silently left observations beyond sequence 276 un-adjusted. For funds with
   up to 276 observations the backed-out values are the same.
   Assumes fund_seq counts consecutive rows from 1 within fundid_mer, as built upstream.
*/

data s2fund01_loop;
set s2fund01;
backed_ret_excat_aic = dem_ret_excat;
keep backed_ret_excat_aic dem_ret_excat fundid_mer yyyymm fund_seq av_ret_fundid
sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3 av_retexcat_fundid;
run;

proc sort data = s2fund01_loop; by fundid_mer yyyymm; run;

data s2fund01_loop;
set s2fund01_loop;
by fundid_mer;

/* prev1-prev3 carry the already backed-out values of the three previous rows.
   LAG() cannot be used here because it would return the values as read from
   disk, not the values updated earlier in this same pass. */
retain prev1_backed prev2_backed prev3_backed;
if first.fundid_mer then call missing(prev1_backed, prev2_backed, prev3_backed);

lag1_backed_ret_excat_aic = prev1_backed;
lag2_backed_ret_excat_aic = prev2_backed;
lag3_backed_ret_excat_aic = prev3_backed;

if fund_seq ge 4 then backed_ret_excat_aic = (dem_ret_excat - sel_phi_1*lag1_backed_ret_excat_aic - sel_phi_2*lag2_backed_ret_excat_aic - sel_phi_3*lag3_backed_ret_excat_aic)/sel_phi_0;

/* shift the window after the current row has been finalised */
prev3_backed = prev2_backed;
prev2_backed = prev1_backed;
prev1_backed = backed_ret_excat_aic;

drop prev1_backed prev2_backed prev3_backed;
run;

%mend back_out_exrets;
%back_out_exrets;


*Reduce the loop output to the fund-month key and the backed-out excess return;
data s2fund01_loop (keep=fundid_mer yyyymm res_backed_ret_excat_aic);
    set s2fund01_loop;
    res_backed_ret_excat_aic = backed_ret_excat_aic;
run;

*Attach it to the step-2 panel;
proc sort data=s2fund01_loop;
    by fundid_mer yyyymm;
run;
proc sort data=s2fund01;
    by fundid_mer yyyymm;
run;

data s2fund02;
    merge s2fund01 s2fund01_loop;
    by fundid_mer yyyymm;
run;

*Verification table - fund-level means of the smoothed and of the backed-out excess return;
proc means data=s2fund02 noprint;
    by fundid_mer;
    var dem_ret_excat res_backed_ret_excat_aic;
    output out=check_step2_mean_excatret mean= / autoname;
run;

**** STEP 3 : Add up residual from aggregate MA and excess ret MA;

*get aggregate MA residual;
*Step-1 output - unsmoothed aggregate return by category and month;
data s1_resid;
    set s1ag02;
    keep categ yyyymm av_aggrret_categ backed_catret_aic dem_backed_catret_aic;
run;

*Step-2 output - unsmoothed excess return by fund and month;
data s2_resid;
    set s2fund02;
    keep fundid_mer yyyymm dem_ret_excat res_backed_ret_excat_aic;
run;

*Bring the aggregate component onto the fund panel;
proc sort data=hf00;
    by categ yyyymm;
run;
proc sort data=s1_resid;
    by categ yyyymm;
run;

data s3_00;
    merge hf00 s1_resid;
    by categ yyyymm;
run;

*Bring the fund-specific component on and add the pieces up;
*3-step return before re-centring = unsmoothed aggregate part + unsmoothed excess part + fund average return;
proc sort data=s3_00;
    by fundid_mer yyyymm;
run;
proc sort data=s2_resid;
    by fundid_mer yyyymm;
run;

data s3_00;
    merge s3_00 s2_resid;
    by fundid_mer yyyymm;
    s3_uns_ret_aic_temp = dem_backed_catret_aic + res_backed_ret_excat_aic + av_ret_fundid;
run;

*Re-centre so that the unsmoothed return has the same fund-level mean as av_ret_fundid;
proc sort data=s3_00;
    by fundid_mer;
run;

proc means data=s3_00 noprint;
    by fundid_mer;
    var s3_uns_ret_aic_temp av_ret_fundid;
    output out=w_adj_s3_uns mean= / autoname;
run;

data s3_00 (drop=s3_uns_ret_aic_temp_mean av_ret_fundid_mean _TYPE_ _FREQ_);
    merge s3_00 w_adj_s3_uns;
    by fundid_mer;
    s3_uns_ret_aic = s3_uns_ret_aic_temp - s3_uns_ret_aic_temp_mean + av_ret_fundid_mean;
run;

*Final verification table - category averages of the reported and unsmoothed returns and of both components;
proc sort data=s3_00;
    by categ;
run;

proc means data=s3_00 noprint;
    by categ;
    var ret av_ret_fundid s3_uns_ret_aic
        dem_backed_catret_aic res_backed_ret_excat_aic;
    output out=check_avret_s3_00_all mean= / autoname;
run;

*********************************************************************************************************************;
/********************************************************************************************************************/

**** End of the regular unsmoothing code;
**** The simulation code starts below;

*********************************************************************************************************************;
/********************************************************************************************************************/
***** Obtain beta distribution based on 3-step unsmoothed returns;

*Fund panel carrying the 3-step unsmoothed return under the short name s3ret;
data s3_01;
    set s3_00;
    s3ret = s3_uns_ret_aic;
    keep yyyymm fundid_mer categ s3ret
         fundid_mer_obs fund_seq imputed_add_date;
run;

*Read the hedge fund factors and give them the generic names fh1 to fh8;
data hffact;
    set hfautoc.hf_factors;
    fh1 = sp500_rf;
    fh2 = size_spread;
    fh3 = emerg_mkt_rf;
    fh4 = FS_bond_mkt;
    fh5 = FS_credit_sprd;
    fh6 = PTFSBD;
    fh7 = PTFSFX;
    fh8 = PTFSCOM;
    keep yyyymm fh1-fh8 rf;
run;

*lag factors;
*Adds the 1-, 2- and 3-month lags of each of the eight factors to hffact;
*The data are first ordered by yyyymm so that each lag refers to the preceding rows in date order;
*Every LAG call site keeps its own queue, so the assignments are written out one by one rather than in a loop;
*The first one, two or three rows have missing lags because no earlier row exists;
proc sort data = hffact; by yyyymm; run;

data hffact;
set hffact;
*1 lag - value of the factor in the previous row;
l1_fh1 = lag1(fh1);
l1_fh2 = lag1(fh2);
l1_fh3 = lag1(fh3);
l1_fh4 = lag1(fh4);
l1_fh5 = lag1(fh5);
l1_fh6 = lag1(fh6);
l1_fh7 = lag1(fh7);
l1_fh8 = lag1(fh8);
*2 lags - value of the factor two rows back;
l2_fh1 = lag2(fh1);
l2_fh2 = lag2(fh2);
l2_fh3 = lag2(fh3);
l2_fh4 = lag2(fh4);
l2_fh5 = lag2(fh5);
l2_fh6 = lag2(fh6);
l2_fh7 = lag2(fh7);
l2_fh8 = lag2(fh8);
*3 lags - value of the factor three rows back;
l3_fh1 = lag3(fh1);
l3_fh2 = lag3(fh2);
l3_fh3 = lag3(fh3);
l3_fh4 = lag3(fh4);
l3_fh5 = lag3(fh5);
l3_fh6 = lag3(fh6);
l3_fh7 = lag3(fh7);
l3_fh8 = lag3(fh8);
run;

*Attach the factors to each fund-month and form the return in excess of rf;
proc sort data=s3_01;
    by yyyymm;
run;
proc sort data=hffact;
    by yyyymm;
run;

data s3_01;
    merge s3_01 (in=has_fund)
          hffact;
    by yyyymm;
    if has_fund;
    s3retrt = s3ret - rf;
run;

*Fund-by-fund regression of the 3-step unsmoothed excess return on the eight contemporaneous factors;
proc sort data=s3_01;
    by fundid_mer yyyymm;
run;

proc reg data=s3_01 outest=v_s3_fund_fh00 noprint tableout;
    model s3retrt = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf ADJRSQ;
    by fundid_mer;
run;
quit;

*Keep the parameter row of each fund and rename the estimates with an r3_ prefix;
*dumm is a constant key used later to merge in sample-wide statistics;
data v_s3_fund_fh00_coef_r3;
    set v_s3_fund_fh00 (where=(_TYPE_ = 'PARMS'));
    r3_rmse   = _RMSE_;
    r3_fh_int = intercept;
    array est_beta {8} fh1-fh8;
    array out_beta {8} r3_fh1-r3_fh8;
    do k = 1 to 8;
        out_beta{k} = est_beta{k};
    end;
    dumm = 1;
    keep fundid_mer dumm r3_fh_int
         r3_rmse r3_fh1-r3_fh8;
run;

*The simulated sample statistics cannot be matched unless the betas are winsorized slightly;
*Cross-fund 2.5th and 97.5th percentiles of each beta;
proc univariate data=v_s3_fund_fh00_coef_r3 noprint;
    by dumm;
    var r3_fh1-r3_fh8;
    output out=w_wins_betas1
           pctlpts=2.5 97.5
           pctlpre=r3_fh1_p r3_fh2_p r3_fh3_p r3_fh4_p
                   r3_fh5_p r3_fh6_p r3_fh7_p r3_fh8_p;
run;

*Clamp each beta to its percentile band, overwriting the raw estimate;
data v_s3_fund_fh00_coef_r3;
    merge v_s3_fund_fh00_coef_r3 w_wins_betas1;
    by dumm;

    array beta   {8} r3_fh1-r3_fh8;
    array cut_lo {8} r3_fh1_p2_5  r3_fh2_p2_5  r3_fh3_p2_5  r3_fh4_p2_5
                     r3_fh5_p2_5  r3_fh6_p2_5  r3_fh7_p2_5  r3_fh8_p2_5;
    array cut_hi {8} r3_fh1_p97_5 r3_fh2_p97_5 r3_fh3_p97_5 r3_fh4_p97_5
                     r3_fh5_p97_5 r3_fh6_p97_5 r3_fh7_p97_5 r3_fh8_p97_5;

    do k = 1 to 8;
        beta{k} = min(max(beta{k}, cut_lo{k}), cut_hi{k});
    end;

    keep fundid_mer dumm r3_fh_int
         r3_rmse r3_fh1-r3_fh8;
run;


**********************************************************;
*Now bootstrap fund-level betas and smoothing coefficients;
*Prepare datasets before bootstrapping procedure begins;

/* Simulation settings */
%let err_std = 0.022;   /* standard deviation of the simulated return shock (2.2 percent) */
%let bigt  = 85;        /* T, the length of each simulated fund series */
%let totsimuln = 100;   /* number of simulation runs */


*Fund list with a running row number;
data boot_fund_list_number (keep=fundid_mer boot_seq);
    set v_s3_fund_fh00_coef_r3;
    boot_seq + 1;
run;

*Pool of beta vectors to draw from - six stacked copies of the estimates so that a draw can repeat;
data boot_betas (keep=r3_fh1 r3_fh2 r3_fh3 r3_fh4 r3_fh5 r3_fh6 r3_fh7 r3_fh8 r3_rmse);
    set v_s3_fund_fh00_coef_r3
        v_s3_fund_fh00_coef_r3
        v_s3_fund_fh00_coef_r3
        v_s3_fund_fh00_coef_r3
        v_s3_fund_fh00_coef_r3
        v_s3_fund_fh00_coef_r3;
run;

*Pool of fund-level weights (phis) to draw from - eight stacked copies;
data boot_phis (keep=sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3 dumm);
    set w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis
        w_s2_res_phis;
    dumm = 1;
run;

*Aggregate weights (pais) - the same for all funds;
data boot_pais (keep=sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3 dumm);
    set s1_ag_pais;
    dumm = 1;
run;

*Attach the pais to every phi row through the constant key;
data boot_phis;
    merge boot_phis boot_pais;
    by dumm;
run;

*Make basic panel with fund-month obs and factor returns;

*Start month of each fund - the first observation must fall in or before 201501 (at least 3 years of data);
data w_boot_fundstart (keep=fund_start_yyyymm fundid_mer dumsql);
    set s3_01;
    if yyyymm <= 201501 and fund_seq = 1;
    fund_start_yyyymm = yyyymm;
    dumsql = 1;
run;

*Factor history from 199410 to 201712 - one row per month;
data w_fhf (keep=yyyymm fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 dumfh);
    set hffact;
    dumfh = 1;
    if 199410 <= yyyymm <= 201712;
run;

proc sort data=w_fhf nodupkey;
    by yyyymm;
run;

*Sample mean and standard deviation of each factor - these parameterise the factor draws;
proc means data=w_fhf noprint;
    by dumfh;
    var fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
    output out=fhf_stats
           mean= std= / autoname;
run;

*Month frame on which simulated factor values are drawn;
data w_fhf_frame;
    set w_fhf;
    keep yyyymm dumfh;
run;

*Each month of the frame carries the factor moments;
data wfact_draw_frame (drop=_TYPE_ _FREQ_);
    merge w_fhf_frame fhf_stats;
    by dumfh;
run;


*Make factors;

%macro draw_facts;

/*
   Purpose : run &totsimuln simulations. Each run (1) draws eight serially
             near-uncorrelated normal factor series, (2) bootstraps fund betas and
             smoothing weights, (3) builds economic returns (eret) and smoothed,
             observed returns (oret), (4) estimates factor regressions on both, and
             (5) saves the simulated panel as boot_funds_v2rv_<run number>.

   Reads   : global macro variables totsimuln, bigt, err_std;
             data sets wfact_draw_frame, w_boot_fundstart, boot_fund_list_number,
             boot_betas, boot_phis.
   Writes  : boot_funds_v2rv_1 ... boot_funds_v2rv_&totsimuln, plus work tables that
             are overwritten on every run (they hold the last run when the macro ends).

   Random numbers: every DATA step calls STREAMINIT with a seed built from the run
   number (and the factor / draw index where relevant), so results are reproducible.

   Change versus the earlier version: the one-to-one merge of the three lag-correlation
   tables now carries BY fact_i, so rows are paired on the draw index instead of on
   their physical position. All other statements are unchanged.
*/

%let factor_draws = 100;

%do simuln = 1 %to &totsimuln;

%do fi = 1 %to 8;

%do i = 1 %to &factor_draws;


/* Candidate draw number &i of factor &fi: i.i.d. normal with the sample mean and
   standard deviation of the factor, plus its first three lags. */
data wfact_draw&i;
set wfact_draw_frame;
call streaminit(%eval(&simuln*10000 + &fi*1000 + &i));
fhf = RAND('NORMAL',fh&fi._mean,fh&fi._stddev);
lag1_fhf = lag1(fhf);
lag2_fhf = lag2(fhf);
lag3_fhf = lag3(fhf);
fact_i = &i;
keep yyyymm fhf lag1_fhf lag2_fhf lag3_fhf dumfh fact_i;
run;

%end;

/* Stack all candidate draws; the stacking order keeps the data sorted by fact_i. */
data wfact_draw_all;
set %do i = 1 %to &factor_draws; wfact_draw&i %end; ;
run;

/* Correlation of each candidate series with its own lags 1 to 3. */
proc corr data = wfact_draw_all out = wfact_draw_corr(where=(_TYPE_="CORR")) noprint;
var fhf;
with lag1_fhf lag2_fhf lag3_fhf;
by fact_i;
run;

data wfact_draw_corr1; set wfact_draw_corr; if _NAME_ = 'lag1_fhf'; fcorr1 = fhf; keep fact_i fcorr1; run;
data wfact_draw_corr2; set wfact_draw_corr; if _NAME_ = 'lag2_fhf'; fcorr2 = fhf; keep fact_i fcorr2; run;
data wfact_draw_corr3; set wfact_draw_corr; if _NAME_ = 'lag3_fhf'; fcorr3 = fhf; keep fact_i fcorr3; run;

/* Autocorrelation score of each candidate: absolute lag correlations weighted
   1, 0.6 and 0.36. The inputs are subsets of a BY fact_i output, so they are
   already ordered on fact_i and can be match-merged on it. */
data wfact_draw_corr123; 
merge wfact_draw_corr1 wfact_draw_corr2 wfact_draw_corr3; 
by fact_i;
fcorr_abssum = 1*abs(fcorr1) + 0.6*abs(fcorr2) + 0.36*abs(fcorr3);
run;

/* Keep the four candidates with the lowest score, then pick one of them at random.
   Here &i equals &factor_draws + 1 because the inner loop above has finished. */
proc sort data =  wfact_draw_corr123; by fcorr_abssum; run;
data wfact_draw_corr123_keep_f&fi; 
set wfact_draw_corr123; 
call streaminit(%eval(&simuln*10000 + &fi*1000 + &i));
if _n_ le 4;
u1 = rand("Uniform");
run;

proc sort data =  wfact_draw_corr123_keep_f&fi; by u1; run;
data wfact_draw_corr123_keep_f&fi; 
set wfact_draw_corr123_keep_f&fi; 
if _n_ = 1;
keep_factor_i = 1;
run;

proc sort data = wfact_draw_corr123_keep_f&fi; by fact_i; run;
proc sort data = wfact_draw_all; by fact_i; run;

/* Pull the monthly series of the selected candidate. */
data wfact_draw_all_select; 
merge wfact_draw_all(in=a) wfact_draw_corr123_keep_f&fi;
by fact_i; 
if a;
if keep_factor_i = 1;
run;


data fact_draw_f&fi; *clean;
set wfact_draw_all_select; 
fh&fi = fhf;
l1_fh&fi = lag1_fhf;
l2_fh&fi = lag2_fhf;
l3_fh&fi = lag3_fhf;
drop fact_i keep_factor_i 
fhf lag1_fhf lag2_fhf lag3_fhf
fcorr1 fcorr2 fcorr3 fcorr_abssum u1;
run;

*keep stats regarding drawn factor;
data wdrawn_factor_f&fi;
set wfact_draw_all_select; 
run;

%end;



*Merge the 8 drawn factors together;
/* Months before 199501 only serve to fill the three lags and are dropped here. */
data fact_draw_all;
merge fact_draw_f1 fact_draw_f2 fact_draw_f3 fact_draw_f4 
fact_draw_f5 fact_draw_f6 fact_draw_f7 fact_draw_f8;
by yyyymm;
if yyyymm ge 199501;
run;

*stats of drawn factors;
proc summary data = fact_draw_all;
var fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8;
output out = fact_draw_mean_std
mean = std = /autoname;
run;


*Make the frame with funds and factor returns;
/* dumsql and dumfh are both the constant 1, so this is every fund crossed with every month. */
proc sql; create table boot_panel_frame as select * 
from w_boot_fundstart, fact_draw_all
where w_boot_fundstart.dumsql = fact_draw_all.dumfh;
quit;

proc sort data = boot_panel_frame nodupkey; by fundid_mer yyyymm; run;

/* Keep, for each fund, the months from its start month up to &bigt + 3 months later.
   reco_fund_seq is the calendar month count since the start month (start month = 1).
   ROUND gives the year here because the month part of yyyymm/100 never exceeds 0.12. */
data boot_panel_frame;
set boot_panel_frame;
yyyy = round(yyyymm/100,1);
mm = yyyymm - yyyy*100;
fund_start_yyyy = round(fund_start_yyyymm/100,1);
fund_start_mm = fund_start_yyyymm - fund_start_yyyy*100;
reco_fund_seq = (yyyy-fund_start_yyyy)*12 + (mm-fund_start_mm) + 1;
if reco_fund_seq ge 1;
if reco_fund_seq le (&bigt + 3);
fund_seq = reco_fund_seq;
drop yyyy mm fund_start_yyyy fund_start_mm;
run;

proc sort data = boot_panel_frame; by fundid_mer yyyymm; run;
proc summary data = boot_panel_frame; 
var reco_fund_seq yyyymm;
output out = v_boot_panel_frame_dates
min = max = /autoname;
by fundid_mer;
run;


********************************************;
*Simulate panel of funds;
*Bootstrap betas;
/* Shuffle the pooled beta rows by sorting on a uniform random number. */
data boot_betas_select;
set boot_betas;
call streaminit(&simuln);
u_betas = rand("Uniform");
run;
proc sort data = boot_betas_select; by u_betas; run;

*Bootstrap phi coefficients;
/* Same shuffle for the pooled smoothing weights, with a different seed. */
data boot_phis_select;
set boot_phis;
call streaminit(%eval(&simuln +10));
u_phis = rand("Uniform");
run;
proc sort data = boot_phis_select; by u_phis; run;

*Add to fund list - no need to specify merging variable;
*This step cuts down number of bootstrapped funds to 670;
/* Positional merge: row k of the fund list receives row k of each shuffled pool. */
data boot_fund_list_number_select;
merge boot_fund_list_number(in=a) boot_betas_select boot_phis_select;
if a;
run;

*Add to panel - merge by fundid_mer; 
proc sort data = boot_panel_frame; by fundid_mer; run;
proc sort data = boot_fund_list_number_select; by fundid_mer; run;

data boot_panel_select;
merge boot_panel_frame(in=a) boot_fund_list_number_select; 
by fundid_mer; 
if a;
run;

*Generate economic fund returns;
/* eret = bootstrapped betas times drawn factors + normal shock with std &err_std. */
data boot_panel_select;
set boot_panel_select;
call streaminit(%eval(&simuln +20));
mm_err = RAND('NORMAL',0,&err_std);
eret = r3_fh1*fh1 +r3_fh2*fh2 +r3_fh3*fh3 +r3_fh4*fh4
+r3_fh5*fh5 +r3_fh6*fh6 +r3_fh7*fh7 +r3_fh8*fh8 + mm_err;
run;

proc summary data = boot_panel_select;
var eret;
output out = v_eret_stats
mean = std = /autoname;
by fundid_mer;
run;

*get lags;
/* Lags of eret within fund; the first rows of each fund are blanked so that a lag
   never reaches into the previous fund. */
proc sort data = boot_panel_select; by fundid_mer yyyymm; run;
data boot_panel_select;
set boot_panel_select;
lag1_eret = lag1(eret); if fund_seq = 1 then lag1_eret = .;
lag2_eret = lag2(eret); if fund_seq le 2 then lag2_eret = .;
lag3_eret = lag3(eret); if fund_seq le 3 then lag3_eret = .;
run;

************************************;
* Now need to "smooth" the simulated economic fund returns;
*First compute aggregate return;

proc sort data = boot_panel_select; by yyyymm; run;
proc summary data = boot_panel_select; 
var eret;
output out = v_boot_panel_select_ew
mean = eret_ew;
by yyyymm; 
run;

data v_boot_panel_select_ew;
set v_boot_panel_select_ew;
lag1_eret_ew = lag1(eret_ew);
lag2_eret_ew = lag2(eret_ew);
lag3_eret_ew = lag3(eret_ew);
drop _TYPE_ _FREQ_;
run;

/* Fund return in excess of the equal-weighted aggregate, contemporaneous and lagged. */
data boot_panel_select;
merge boot_panel_select v_boot_panel_select_ew;
by yyyymm; 
ex_eret = eret - eret_ew;
lag1_ex_eret = lag1_eret - lag1_eret_ew;
lag2_ex_eret = lag2_eret - lag2_eret_ew;
lag3_ex_eret = lag3_eret - lag3_eret_ew;
run;

*Smoothed (observed) return;
/* The aggregate part is smoothed with the pais and the excess part with the phis. */
data boot_panel_select;
set boot_panel_select;
oret = sel_pai_0*eret_ew +sel_pai_1*lag1_eret_ew +sel_pai_2*lag2_eret_ew +sel_pai_3*lag3_eret_ew
+sel_phi_0*ex_eret +sel_phi_1*lag1_ex_eret +sel_phi_2*lag2_ex_eret +sel_phi_3*lag3_ex_eret;
run;

*Drop first 3 observations, then run regressions;
data boot_panel_gt3;
set boot_panel_select;
if fund_seq gt 3;
run;

*Run regression using simulated returns;
/* Eight models per fund: eret and oret, each on the factors with 0, 1, 2 and 3 lags. */
proc sort data = boot_panel_gt3; by fundid_mer yyyymm; run;
proc reg data = boot_panel_gt3 outest = v_boot_panel_reg1 noprint tableout;
*economic return;
model eret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 /edf ADJRSQ;
model eret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8 /edf ADJRSQ;
model eret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8
l2_fh1 l2_fh2 l2_fh3 l2_fh4 l2_fh5 l2_fh6 l2_fh7 l2_fh8/edf ADJRSQ;
model eret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8
l2_fh1 l2_fh2 l2_fh3 l2_fh4 l2_fh5 l2_fh6 l2_fh7 l2_fh8
l3_fh1 l3_fh2 l3_fh3 l3_fh4 l3_fh5 l3_fh6 l3_fh7 l3_fh8/edf ADJRSQ;
*observed return;
model oret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 /edf ADJRSQ;
model oret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8 /edf ADJRSQ;
model oret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8
l2_fh1 l2_fh2 l2_fh3 l2_fh4 l2_fh5 l2_fh6 l2_fh7 l2_fh8/edf ADJRSQ;
model oret = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 
l1_fh1 l1_fh2 l1_fh3 l1_fh4 l1_fh5 l1_fh6 l1_fh7 l1_fh8
l2_fh1 l2_fh2 l2_fh3 l2_fh4 l2_fh5 l2_fh6 l2_fh7 l2_fh8
l3_fh1 l3_fh2 l3_fh3 l3_fh4 l3_fh5 l3_fh6 l3_fh7 l3_fh8/edf ADJRSQ;
by fundid_mer; 
run; quit;

/* Per model: alpha and, for each factor, the sum of the contemporaneous and lagged
   betas. SUM() skips the lag terms that a model does not contain. With fewer than
   45 regression observations the lag-3 beta is left out of the sum. */
data v_boot_panel_reg1;
set v_boot_panel_reg1;
reg_obs = _P_ + _EDF_;
if _TYPE_ = "PARMS";
alp = Intercept;
f1d_sum = sum(fh1,l1_fh1,l2_fh1,l3_fh1);
f2d_sum = sum(fh2,l1_fh2,l2_fh2,l3_fh2);
f3d_sum = sum(fh3,l1_fh3,l2_fh3,l3_fh3);
f4d_sum = sum(fh4,l1_fh4,l2_fh4,l3_fh4);
f5d_sum = sum(fh5,l1_fh5,l2_fh5,l3_fh5);
f6d_sum = sum(fh6,l1_fh6,l2_fh6,l3_fh6);
f7d_sum = sum(fh7,l1_fh7,l2_fh7,l3_fh7);
f8d_sum = sum(fh8,l1_fh8,l2_fh8,l3_fh8);
if reg_obs lt 45 then do;
f1d_sum = sum(fh1,l1_fh1,l2_fh1);
f2d_sum = sum(fh2,l1_fh2,l2_fh2);
f3d_sum = sum(fh3,l1_fh3,l2_fh3);
f4d_sum = sum(fh4,l1_fh4,l2_fh4);
f5d_sum = sum(fh5,l1_fh5,l2_fh5);
f6d_sum = sum(fh6,l1_fh6,l2_fh6);
f7d_sum = sum(fh7,l1_fh7,l2_fh7);
f8d_sum = sum(fh8,l1_fh8,l2_fh8);
end;
run;

proc sort data = v_boot_panel_reg1; by _MODEL_ _DEPVAR_; run;
proc summary data = v_boot_panel_reg1; 
var alp f1d_sum f2d_sum f3d_sum f4d_sum
f5d_sum f6d_sum f7d_sum f8d_sum;
output out = v_boot_panel_reg1_mean
mean = std = /autoname;
by _MODEL_ _DEPVAR_; run;

*Use real alpha (eret intercept) to re-set alphas to zero;
/* MODEL1 is the first MODEL statement above: eret on the contemporaneous factors. */
data v_boot_panel_reg1_orig_alp;
set v_boot_panel_reg1;
orig_alp = intercept;
if _MODEL_ = "MODEL1";
keep fundid_mer orig_alp;
run;

proc sort data = boot_panel_gt3; by fundid_mer; run;
proc sort data = v_boot_panel_reg1_orig_alp; by fundid_mer; run;
data boot_panel_gt3;
merge boot_panel_gt3(in=a) v_boot_panel_reg1_orig_alp; 
by fundid_mer; 
eret_adj = eret - orig_alp;
oret_adj = oret - orig_alp;
run;

*Summary stats for simulated returns;
*To compare to actual fund returns;
*Note: alpha has been set to 0;
proc sort data = boot_panel_gt3; by fundid_mer; run;
proc summary data = boot_panel_gt3; 
var eret_adj oret_adj;
output out = v_simulret_byfund
mean = std = /autoname;
by fundid_mer; run;
proc summary data = v_simulret_byfund; 
var eret_adj_mean eret_adj_stddev oret_adj_mean oret_adj_stddev _FREQ_;
output out = v_simulret_stats
mean = eret_mean eret_stddev oret_mean oret_stddev avg_t;
run;

data v_simulret_stats;
set v_simulret_stats;
eret_mean_ann = eret_mean*12;
eret_stddev_ann = eret_stddev*sqrt(12); 
oret_mean_ann = oret_mean*12; 
oret_stddev_ann = oret_stddev*sqrt(12);
run;

*Save bootstrapped panel, with factors too;
/* categ and aut_rank are both set to the run number. fund_seq is reset so that the
   first retained month is 1. fh1--l3_fh8 keeps every column positioned between
   fh1 and l3_fh8 in the data set. */
data boot_funds_v2rv_&simuln;
set boot_panel_gt3;
categ = &simuln;
aut_rank = &simuln;
fund_seq = reco_fund_seq - 3; *reset;
keep fundid_mer yyyymm fund_seq categ aut_rank
eret_adj oret_adj
fh1--l3_fh8;
run;

%end;

%mend draw_facts;
%draw_facts;



***********************************************************************************************************;
***********************************************************************************************************;

****** Proceed to step 2 of simulation: analyze the bootstrapped fund returns;
**** Read Data Simulated for Panel B;
**** Factors are also simulated (inside dataset);
*** Coding note to self: "categ" in the input data refers to the simulation run #;


%macro ODSOff();
    /* Silence ODS: graphics off, all output objects excluded, no results tracking. */
    ods graphics off;
    ods exclude all;
    ods noresults;
%mend ODSOff;
 
%macro ODSOn();
    /* Re-enable ODS: graphics on, no output objects excluded, results tracking on. */
    ods graphics on;
    ods exclude none;
    ods results;
%mend ODSOn;


***Rename dataset with bootstrapped data;
*Stack the simulated panels of all runs and treat the smoothed (observed) return as the fund return;
*The simulation macro saves one data set per run, named boot_funds_v2rv_1 to boot_funds_v2rv_N with N = totsimuln;
*No data set called boot_funds_v2rv is created in this program, so the numbered list is read directly;
*NOTE(review) - if the panel is instead loaded from a saved copy, point the SET statement at that copy;
data vread_hf00;
set boot_funds_v2rv_1-boot_funds_v2rv_&totsimuln;
ret = oret_adj;
run;

*Restrict to the sample window 199501 to 201712;
data vread_hf00;
set vread_hf00;
if yyyymm ge 199501;
if yyyymm le 201712;
run;

*split into funds and factors;
*categ and aut_rank both hold the simulation run number;
data hf00; *Funds;
set vread_hf00;
keep fundid_mer yyyymm ret fund_seq categ aut_rank;
run;

*fh1--l3_fh8 keeps every column positioned between fh1 and l3_fh8 in the data set;
data vread_factor00; *Factors;
set vread_hf00;
keep fundid_mer yyyymm categ aut_rank fh1--l3_fh8;
run;

*average return by fund;
*Mean and standard deviation of the return for each fund within each simulation run;
proc sort data=hf00;
    by categ fundid_mer;
run;

proc means data=hf00 noprint;
    by categ fundid_mer;
    var ret;
    output out=w_mean_ret_1
           mean=av_ret_fundid
           std=std_ret_fundid;
run;

*Attach the fund statistics and keep funds with 36 or more rows;
data hf00 (drop=_FREQ_ _TYPE_);
    merge hf00 w_mean_ret_1;
    by categ fundid_mer;
    fundid_mer_obs = _FREQ_;
    if fundid_mer_obs >= 36;
run;

*Run-month table - _FREQ_ in the output is the number of fund rows in that run and month;
proc sort data=hf00;
    by yyyymm categ;
run;

proc means data=hf00 noprint;
    by yyyymm categ;
    var ret;
    output out=w_categ_mm_n mean=av_ret_categ;
run;

*One row per fund within run - the first row met for each key is the one retained;
proc sort data=hf00 (keep=fundid_mer ret categ)
          out=w_count_categ_funds
          nodupkey;
    by categ fundid_mer;
run;

*Run table - _FREQ_ in the output is the number of distinct funds in the run;
proc means data=w_count_categ_funds noprint;
    by categ;
    var ret;
    output out=w_categ_n mean=av_ret_categ;
run;


/******************************************************************************************************************/
**** Part 1: normal MA method as in GLM(2004);

*start by de-meaning fund return;
*De-mean each fund return with its own sample mean;
data hf00;
    set hf00;
    dem_ret = ret - av_ret_fundid;
run;

*Fund-by-fund MA(3) without intercept, estimated by maximum likelihood;
*The three MA starting values are set to -0.2;
proc sort data=hf00;
    by categ fundid_mer yyyymm;
run;

proc arima data=hf00;
    identify var = dem_ret noprint;
    estimate q = 3 noint ma = -0.2 -0.2 -0.2 method=ml
             OUTEST = arima_ma3_est
             OUTSTAT = arima_ma3_diag
             noprint;
    by categ fundid_mer;
run;
quit;

*Turn the MA coefficients into smoothing weights (thetas) that sum to one;
*The weights are 1, -ma1_1, -ma1_2, -ma1_3 divided by their sum;
data arima_ma3_est (keep=categ fundid_mer ma3_STATUS_
                         ma3_theta_0 ma3_theta_1 ma3_theta_2 ma3_theta_3
                         ma3_theta_sum ma3_theta_sum_norm);
    set arima_ma3_est (where=(_TYPE_ = 'EST'));
    ma3_STATUS_   = _STATUS_;
    ma3_theta_sum = 1 - ma1_1 - ma1_2 - ma1_3;
    ma3_theta_0   =      1 / ma3_theta_sum;
    ma3_theta_1   = -ma1_1 / ma3_theta_sum;
    ma3_theta_2   = -ma1_2 / ma3_theta_sum;
    ma3_theta_3   = -ma1_3 / ma3_theta_sum;
    *sanity check - the four normalised weights should sum to one;
    ma3_theta_sum_norm = ma3_theta_0 + ma3_theta_1 + ma3_theta_2 + ma3_theta_3;
run;

*attach thetas to main dataset;
*Attach the fund-level weights to every fund-month row and pick the weights to use;
data hf01;
    merge hf00 arima_ma3_est;
    by categ fundid_mer;

    *Flag implausible estimates - any weight above 1.25 or at or below -0.45;
    *A missing weight also trips the lower test because a missing value compares below every number;
    ma3_gt_1_5 = 0;
    if ma3_theta_0 > 1.25 or ma3_theta_1 > 1.25 or ma3_theta_2 > 1.25 or ma3_theta_3 > 1.25
        then ma3_gt_1_5 = 1;
    if ma3_theta_0 <= -0.45 or ma3_theta_1 <= -0.45 or ma3_theta_2 <= -0.45 or ma3_theta_3 <= -0.45
        then ma3_gt_1_5 = 1;

    *1 when the ARIMA fit did not report convergence;
    ma3_status = (ma3_STATUS_ ne "0 Converged");

    *Default to the estimated weights;
    sel_theta_0 = ma3_theta_0;
    sel_theta_1 = ma3_theta_1;
    sel_theta_2 = ma3_theta_2;
    sel_theta_3 = ma3_theta_3;

    *Fall back to no smoothing (all weight on lag 0) for flagged or non-converged fits;
    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_theta_0 = 1;
        sel_theta_1 = 0;
        sel_theta_2 = 0;
        sel_theta_3 = 0;
    end;
run;

*One row per fund, then the average selected weights by simulation run;
proc sort data=hf01 out=v_check_thetas nodupkey;
    by aut_rank categ fundid_mer;
run;

proc means data=v_check_thetas noprint;
    by aut_rank categ;
    var sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
    output out=v_av_thetas
           mean=sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
run;

/********************************************************/
/* Back out the unsmoothed (de-meaned) fund returns implied by the selected MA(3) weights. */

proc sort data = hf01; by categ fundid_mer yyyymm; run;

/*
  back_out_rets

  Builds hf01_loop from hf01 and fills backed_ret_aic recursively:
      backed(t) = ( dem_ret(t) - theta1*backed(t-1) - theta2*backed(t-2)
                    - theta3*backed(t-3) ) / theta0
  for fund_seq = 4, 5, ..., last_seq. Rows with fund_seq 1-3 keep dem_ret.

  Parameter
    last_seq   Optional. Highest fund_seq to process. When it is omitted the
               macro uses the largest fund_seq found in the data, so a fund
               with more than 276 observations (the former hard-coded bound)
               is no longer left partly smoothed. Results are unchanged
               whenever no fund_seq exceeds 276.

  Why one DATA step per sequence number: the LAGn calls run before the
  assignment, so they return the values stored by the previous pass. Each pass
  can therefore only complete the next fund_seq.
*/
%macro back_out_rets(last_seq=);
    %local i;

    /* Seed the recursion with the de-meaned reported return. */
    data hf01_loop;
        set hf01;
        backed_ret_aic = dem_ret;
        keep backed_ret_aic categ
             dem_ret fundid_mer yyyymm fund_seq av_ret_fundid
             sel_theta_0 sel_theta_1 sel_theta_2 sel_theta_3;
    run;

    proc sort data = hf01_loop; by categ fundid_mer yyyymm; run;

    /* No bound supplied: take it from the data (0 when fund_seq is all missing,
       which makes the loop below a no-op). */
    %if %length(&last_seq) = 0 %then %do;
        proc sql noprint;
            select ceil(coalesce(max(fund_seq), 0)) format = 12.
              into :last_seq
              from hf01_loop;
        quit;
        /* Re-assigning strips the blanks PROC SQL pads the value with. */
        %let last_seq = &last_seq;
    %end;

    %do i = 4 %to &last_seq;

        data hf01_loop;
            set hf01_loop;
            lag1_backed_ret_aic = lag1(backed_ret_aic);
            lag2_backed_ret_aic = lag2(backed_ret_aic);
            lag3_backed_ret_aic = lag3(backed_ret_aic);

            if fund_seq = &i then backed_ret_aic = (dem_ret - sel_theta_1*lag1_backed_ret_aic - sel_theta_2*lag2_backed_ret_aic - sel_theta_3*lag3_backed_ret_aic)/sel_theta_0;
        run;

    %end;

%mend back_out_rets;
%back_out_rets;

/* Add the fund mean back to the backed-out series and keep only the merge keys. */
data hf01_loop (keep = categ fundid_mer yyyymm temp_backed_ret_aic);
    set hf01_loop;
    temp_backed_ret_aic = backed_ret_aic + av_ret_fundid;
run;

proc sort data = hf01_loop; by categ fundid_mer yyyymm; run;
proc sort data = hf01;      by categ fundid_mer yyyymm; run;

/* Return panel plus the provisional unsmoothed return. */
data hf02;
    merge hf01 hf01_loop;
    by categ fundid_mer yyyymm;
run;

/* Per-fund mean and standard deviation of the reported and provisional unsmoothed returns. */
proc sort data = hf02; by categ fundid_mer; run;

proc summary data = hf02;
    by categ fundid_mer;
    var ret temp_backed_ret_aic;
    output out = v_hf02_ret_mean
           mean = std = / autoname;
run;

/* Shift the unsmoothed series so that its fund-level mean equals the mean of ret. */
data hf02;
    merge hf02 v_hf02_ret_mean;
    by categ fundid_mer;
    backed_ret_aic = temp_backed_ret_aic + ret_mean - temp_backed_ret_aic_mean;
    drop _TYPE_ _FREQ_;
run;

/* Equal-weighted average of reported and unsmoothed returns per (aut_rank, categ, yyyymm). */
proc sort data = hf02; by aut_rank categ yyyymm; run;

proc summary data = hf02;
    by aut_rank categ yyyymm;
    var ret backed_ret_aic;
    output out = hf2_ewret mean = ret_ew backed_ret_aic_ew;
run;

proc sort data = hf2_ewret; by categ yyyymm; run;

/* First and second lags of both aggregate series.
   A lag is blanked when the row it comes from belongs to a different categ.
   LAG / LAG2 on categ are evaluated on every row, so their queues stay aligned. */
data hf2_ewret;
    set hf2_ewret;

    funds_categ_mm = _FREQ_;    /* number of observations averaged in this cell */

    lag1_ret_ew            = lag1(ret_ew);
    lag2_ret_ew            = lag2(ret_ew);
    lag1_backed_ret_aic_ew = lag1(backed_ret_aic_ew);
    lag2_backed_ret_aic_ew = lag2(backed_ret_aic_ew);

    if lag(categ) ne categ then do;
        lag1_ret_ew = . ;
        lag1_backed_ret_aic_ew = . ;
    end;

    if lag2(categ) ne categ then do;
        lag2_ret_ew = . ;
        lag2_backed_ret_aic_ew = . ;
    end;

    drop _TYPE_ _FREQ_;
run;


********** 3-step Unsmoothing;
**** FIRST STEP: GET AGGREGATE ECONOMIC (UNSMOOTHED) RETURNS;

/* Step 1 input: the equal-weighted category return series. */
data s1ag00 (keep = aut_rank categ yyyymm ret_ew funds_categ_mm);
    set hf2_ewret;
run;

/* Category-level averages of the aggregate return and of the cell counts. */
proc sort data = s1ag00; by categ; run;

proc summary data = s1ag00;
    by categ;
    var ret_ew funds_categ_mm;
    output out = w_mean_ew_ret mean = av_aggrret_categ av_funds_categ_mm;
run;

/* De-mean the aggregate return within category. */
data s1ag00;
    merge s1ag00 w_mean_ew_ret;
    by categ;
    dem_catret_ew = ret_ew - av_aggrret_categ;
    drop _TYPE_ _FREQ_ funds_categ_mm av_funds_categ_mm;
run;

/* Running observation number within each category (1, 2, 3, ...). */
proc sort data = s1ag00; by categ yyyymm; run;

data s1ag00;
    set s1ag00;
    by categ;
    categ_seq + 1;                        /* sum statement: retained counter */
    if first.categ then categ_seq = 1;    /* restart at each new category */
run;

/* MA(3) fit of the de-meaned aggregate return, one fit per category. */
proc sort data = s1ag00;
    by categ yyyymm;
run;

proc arima data = s1ag00;
    identify var = dem_catret_ew noprint;
    /* Same specification as the fund-level fit: no intercept, ML, start values -0.2. */
    estimate q = 3
             noint
             ma = -0.2 -0.2 -0.2
             method = ml
             OUTEST = ag_arima_ma3_est
             OUTSTAT = ag_arima_ma3_diag
             noprint;
    /* forecast noprint;  (left disabled) */
    by categ;
run;
quit;

/* Category-level smoothing weights (pais) from the aggregate MA(3) fit.
   Only the parameter-estimate rows (_TYPE_ = EST) are used.
   Fix: the diagnostic ma3_pai_sum_norm used to leave out ma3_pai_3 and was
   then discarded by KEEP. It now sums all four weights and is kept, matching
   the fund-level theta block. */
data ag_arima_ma3_est;
    set ag_arima_ma3_est;
    if _TYPE_ = 'EST';

    ma3_STATUS_ = _STATUS_;    /* estimation status string reported by PROC ARIMA */

    /* Normalising constant: 1 minus the three MA coefficients. */
    ma3_pai_sum = 1 - ma1_1 - ma1_2 - ma1_3;

    ma3_pai_0 = 1/ma3_pai_sum;
    ma3_pai_1 = -ma1_1/ma3_pai_sum;
    ma3_pai_2 = -ma1_2/ma3_pai_sum;
    ma3_pai_3 = -ma1_3/ma3_pai_sum;

    /* Diagnostic: sum of the four rescaled weights. */
    ma3_pai_sum_norm = ma3_pai_0 + ma3_pai_1 + ma3_pai_2 + ma3_pai_3;

    keep categ ma3_STATUS_ ma3_pai_0 ma3_pai_1 ma3_pai_2 ma3_pai_3 ma3_pai_sum ma3_pai_sum_norm;
run;

/* Put the category weights on the aggregate return series. */
proc sort data = ag_arima_ma3_est; by categ; run;
proc sort data = s1ag00;           by categ; run;

data s1ag01;
    merge s1ag00 ag_arima_ma3_est;
    by categ;
run;

/* Choose the weights actually used (sel_pai_*).
   Fallback to (1, 0, 0, 0) when any weight is above 1.25 or at/below -0.45,
   or when the status is not "0 Converged". A missing weight also trips the
   lower bound because missing sorts below every number in SAS. */
data s1ag01;
    set s1ag01;

    ma3_gt_1_5 = (   ma3_pai_0 gt 1.25  or ma3_pai_1 gt 1.25
                  or ma3_pai_2 gt 1.25  or ma3_pai_3 gt 1.25
                  or ma3_pai_0 le -0.45 or ma3_pai_1 le -0.45
                  or ma3_pai_2 le -0.45 or ma3_pai_3 le -0.45);

    ma3_status = (ma3_STATUS_ ne "0 Converged");

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_pai_0 = 1;
        sel_pai_1 = 0;
        sel_pai_2 = 0;
        sel_pai_3 = 0;
    end;
    else do;
        sel_pai_0 = ma3_pai_0;
        sel_pai_1 = ma3_pai_1;
        sel_pai_2 = ma3_pai_2;
        sel_pai_3 = ma3_pai_3;
    end;
run;

/* One row of selected category weights per (aut_rank, categ). */
proc sort data = s1ag01 (keep = aut_rank categ sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3)
          out  = s1_ag_pais
          nodupkey;
    by aut_rank categ;
run;


/* Back out the unsmoothed (de-meaned) aggregate category returns. */
proc sort data = s1ag01; by categ yyyymm; run;

/*
  back_out_catrets

  Builds s1ag01_loop from s1ag01 and fills backed_catret_aic recursively:
      backed(t) = ( dem_catret_ew(t) - pai1*backed(t-1) - pai2*backed(t-2)
                    - pai3*backed(t-3) ) / pai0
  for categ_seq = 4, 5, ..., last_seq. Rows with categ_seq 1-3 keep dem_catret_ew.

  Parameter
    last_seq   Optional. Highest categ_seq to process. When it is omitted the
               macro uses the largest categ_seq found in the data instead of
               the former hard-coded 276, so longer series are fully processed.
               Results are unchanged whenever no categ_seq exceeds 276.

  Why one DATA step per sequence number: the LAGn calls run before the
  assignment, so they return the values stored by the previous pass.
*/
%macro back_out_catrets(last_seq=);
    %local i;

    /* Seed the recursion with the de-meaned aggregate return. */
    data s1ag01_loop;
        set s1ag01;
        backed_catret_aic = dem_catret_ew;
        keep aut_rank categ categ_seq backed_catret_aic dem_catret_ew yyyymm av_aggrret_categ
             sel_pai_0 sel_pai_1 sel_pai_2 sel_pai_3;
    run;

    proc sort data = s1ag01_loop; by categ yyyymm; run;

    /* No bound supplied: take it from the data (0 when categ_seq is all missing). */
    %if %length(&last_seq) = 0 %then %do;
        proc sql noprint;
            select ceil(coalesce(max(categ_seq), 0)) format = 12.
              into :last_seq
              from s1ag01_loop;
        quit;
        /* Re-assigning strips the blanks PROC SQL pads the value with. */
        %let last_seq = &last_seq;
    %end;

    %do i = 4 %to &last_seq;

        data s1ag01_loop;
            set s1ag01_loop;
            lag1_backed_catret_aic = lag1(backed_catret_aic);
            lag2_backed_catret_aic = lag2(backed_catret_aic);
            lag3_backed_catret_aic = lag3(backed_catret_aic);

            if categ_seq = &i then backed_catret_aic = (dem_catret_ew - sel_pai_1*lag1_backed_catret_aic - sel_pai_2*lag2_backed_catret_aic - sel_pai_3*lag3_backed_catret_aic)/sel_pai_0;
        run;

    %end;

%mend back_out_catrets;
%back_out_catrets;


/* Category mean of the backed-out aggregate series. */
proc sort data = s1ag01_loop; by categ; run;

proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop mean = / autoname;
run;

/* Park the raw series under a temporary name while the mean is attached. */
data s1ag01_loop;
    merge s1ag01_loop check_av_s1ag01_loop;
    by categ;
    temp_backed_catret_aic = backed_catret_aic;
    drop backed_catret_aic;
run;

/* Re-centre: subtract the category mean so the series averages zero. */
data s1ag01_loop;
    set s1ag01_loop;
    backed_catret_aic = temp_backed_catret_aic - backed_catret_aic_mean;
    drop backed_catret_aic_mean temp_backed_catret_aic _FREQ_ _TYPE_;
run;

/* Verification output: category mean after re-centring. */
proc sort data = s1ag01_loop; by categ; run;

proc summary data = s1ag01_loop;
    by categ;
    var backed_catret_aic;
    output out = check_av_s1ag01_loop_v2 mean = / autoname;
run;

/* Keep the zero-mean series as dem_backed_catret_aic and add the category
   mean back to obtain the unsmoothed aggregate return in levels. */
data s1ag01_loop (keep = categ yyyymm backed_catret_aic dem_backed_catret_aic);
    set s1ag01_loop;
    dem_backed_catret_aic = backed_catret_aic;
    backed_catret_aic     = backed_catret_aic + av_aggrret_categ;
run;

proc sort data = s1ag01_loop; by categ yyyymm; run;
proc sort data = s1ag01;      by categ yyyymm; run;

/* Aggregate panel with both reported and unsmoothed category returns. */
data s1ag02;
    merge s1ag01 s1ag01_loop;
    by categ yyyymm;
run;

*****************************************;
*Continue to step 2 of 3-step procedure;

/* Compare reported and unsmoothed aggregate returns within each category. */
proc sort data = s1ag02; by categ; run;

proc corr data = s1ag02 noprint out = corr_catret_backed;
    by categ;
    var ret_ew backed_catret_aic;
run;

/* Keep the MEAN and STD rows plus the row named ret_ew, and express the
   backed_catret_aic column relative to the ret_ew column. */
data corr_catret_backed;
    set corr_catret_backed;
    if _TYPE_ = 'MEAN' or _TYPE_ = "STD" or _NAME_ = "ret_ew";
    est_type   = _TYPE_;
    change_aic = backed_catret_aic/ret_ew - 1;
    drop _TYPE_;
run;

proc sort data = corr_catret_backed;
    by descending est_type descending change_aic;
run;

***********************************************************************************;
* SECOND STEP: GET FUND-LEVEL UNSMOOTHED EXCESS RETURNS (EXCESS WRT EW CATEGORY);
* Unsmooth using 3 lags of excess category returns as covariates;

/* Fund-level inputs for step 2. */
data s2fund00 (keep = ret yyyymm fundid_mer categ fund_seq av_ret_fundid aut_rank);
    set hf00;
run;

/* Category series: reported return (renamed catret_ew) and unsmoothed return. */
data w_au_ewcatert (keep = categ yyyymm catret_ew backed_catret_aic);
    set s1ag02;
    catret_ew = ret_ew;
run;

proc sort data = w_au_ewcatert; by categ yyyymm; run;

/* Lags 1-3 of the unsmoothed category return. A lag is set to missing when the
   row it comes from belongs to another category. Every LAGn call is made on
   every row, so the queues stay aligned. */
data w_au_ewcatert;
    set w_au_ewcatert;

    lag1_backed_catret_aic = lag1(backed_catret_aic);
    lag2_backed_catret_aic = lag2(backed_catret_aic);
    lag3_backed_catret_aic = lag3(backed_catret_aic);

    if categ ne lag1(categ) then lag1_backed_catret_aic = . ;
    if categ ne lag2(categ) then lag2_backed_catret_aic = . ;
    if categ ne lag3(categ) then lag3_backed_catret_aic = . ;
run;

/* Attach the category-month series to every fund-month and form the fund
   return in excess of the reported category return. */
proc sort data = s2fund00;      by categ yyyymm; run;
proc sort data = w_au_ewcatert; by categ yyyymm; run;

data s2fund00;
    merge s2fund00 w_au_ewcatert;
    by categ yyyymm;
    ret_excat = ret - catret_ew;
run;

/* Fund-level means of the excess return and of the unsmoothed category return
   (contemporaneous and three lags), computed over each fund's own rows. */
proc sort data = s2fund00; by categ fundid_mer; run;

proc summary data = s2fund00;
    by categ fundid_mer;
    var ret_excat
        backed_catret_aic
        lag1_backed_catret_aic
        lag2_backed_catret_aic
        lag3_backed_catret_aic;
    output out = w_mean_excat_ret
           mean = av_retexcat_fundid
                  av_backed_catret_aic_fund
                  av_lag1_backed_catret_aic_fund
                  av_lag2_backed_catret_aic_fund
                  av_lag3_backed_catret_aic_fund;
run;

/* De-mean all five series with those fund-level means. */
data s2fund00;
    merge s2fund00 w_mean_excat_ret;
    by categ fundid_mer;
    dem_ret_excat              = ret_excat              - av_retexcat_fundid;
    dem_backed_catret_aic      = backed_catret_aic      - av_backed_catret_aic_fund;
    dem_lag1_backed_catret_aic = lag1_backed_catret_aic - av_lag1_backed_catret_aic_fund;
    dem_lag2_backed_catret_aic = lag2_backed_catret_aic - av_lag2_backed_catret_aic_fund;
    dem_lag3_backed_catret_aic = lag3_backed_catret_aic - av_lag3_backed_catret_aic_fund;
run;

/* Verification output: fund-level means of the de-meaned series. */
proc summary data = s2fund00;
    by categ fundid_mer;
    var dem_ret_excat dem_backed_catret_aic dem_lag1_backed_catret_aic;
    output out = check_avg_dem_ret_excat mean = / autoname;
run;

/* MA(3) fit of the de-meaned excess return for each fund, with the de-meaned
   unsmoothed category return and its three lags as input series. */
proc sort data = s2fund00;
    by categ fundid_mer yyyymm;
run;

proc arima data = s2fund00;
    identify var = dem_ret_excat
             crosscorr = (dem_backed_catret_aic
                          dem_lag1_backed_catret_aic
                          dem_lag2_backed_catret_aic
                          dem_lag3_backed_catret_aic)
             noprint;
    /* No intercept, ML, MA starting values -0.2. Estimates are written to
       excat_arima_ma3_est, fit statistics to excat_arima_ma3_diag. */
    estimate q = 3
             input = (dem_backed_catret_aic
                      dem_lag1_backed_catret_aic
                      dem_lag2_backed_catret_aic
                      dem_lag3_backed_catret_aic)
             noint
             ma = -0.2 -0.2 -0.2
             method = ml
             OUTEST = excat_arima_ma3_est
             OUTSTAT = excat_arima_ma3_diag
             noprint;
    /* forecast noprint;  (left disabled) */
    by categ fundid_mer;
run;
quit;

/* Fund-level smoothing weights (phis) for the excess-return MA(3) fit.
   Only the parameter-estimate rows (_TYPE_ = EST) are used.
   Fix: the diagnostic ma3_phi_sum_norm used to leave out ma3_phi_3 and was
   then discarded by KEEP. It now sums all four weights and is kept, matching
   the first-pass theta block. */
data excat_arima_ma3_est;
    set excat_arima_ma3_est;
    if _TYPE_ = 'EST';

    ma3_STATUS_ = _STATUS_;    /* estimation status string reported by PROC ARIMA */

    /* Normalising constant: 1 minus the three MA coefficients. */
    ma3_phi_sum = 1 - ma1_1 - ma1_2 - ma1_3;

    ma3_phi_0 = 1/ma3_phi_sum;
    ma3_phi_1 = -ma1_1/ma3_phi_sum;
    ma3_phi_2 = -ma1_2/ma3_phi_sum;
    ma3_phi_3 = -ma1_3/ma3_phi_sum;

    /* Diagnostic: sum of the four rescaled weights. */
    ma3_phi_sum_norm = ma3_phi_0 + ma3_phi_1 + ma3_phi_2 + ma3_phi_3;

    keep categ fundid_mer ma3_STATUS_ ma3_phi_0 ma3_phi_1 ma3_phi_2 ma3_phi_3 ma3_phi_sum ma3_phi_sum_norm;
run;

/* Attach the excess-return weights (phis) to the fund panel. */
proc sort data = s2fund00;            by categ fundid_mer; run;
proc sort data = excat_arima_ma3_est; by categ fundid_mer; run;

data s2fund01;
    merge s2fund00 excat_arima_ma3_est;
    by categ fundid_mer;
run;

/* Choose the weights actually used (sel_phi_*).
   Fallback to (1, 0, 0, 0) when any weight is above 1.25 or at/below -0.45,
   or when the status is not "0 Converged". A missing weight also trips the
   lower bound because missing sorts below every number in SAS. */
data s2fund01;
    set s2fund01;

    ma3_gt_1_5 = (   ma3_phi_0 gt 1.25  or ma3_phi_1 gt 1.25
                  or ma3_phi_2 gt 1.25  or ma3_phi_3 gt 1.25
                  or ma3_phi_0 le -0.45 or ma3_phi_1 le -0.45
                  or ma3_phi_2 le -0.45 or ma3_phi_3 le -0.45);

    ma3_status = (ma3_STATUS_ ne "0 Converged");

    if ma3_gt_1_5 = 1 or ma3_status = 1 then do;
        sel_phi_0 = 1;
        sel_phi_1 = 0;
        sel_phi_2 = 0;
        sel_phi_3 = 0;
    end;
    else do;
        sel_phi_0 = ma3_phi_0;
        sel_phi_1 = ma3_phi_1;
        sel_phi_2 = ma3_phi_2;
        sel_phi_3 = ma3_phi_3;
    end;
run;


/* One row of selected phis per fund. */
proc sort data = s2fund01 (keep = aut_rank categ fundid_mer sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3)
          out  = w_s2_res_phis
          nodupkey;
    by aut_rank categ fundid_mer;
run;

/* Cross-fund summaries per (aut_rank, categ), produced in a single pass:
   averages, min/max, and 1st/99th percentiles (the last two as outlier checks). */
proc summary data = w_s2_res_phis;
    by aut_rank categ;
    var sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
    output out = s2_res_phis
           mean = sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3;
    output out = s2_res_phis_minmax
           min = max = / autoname;
    output out = s2_res_phis_p1p99
           p1 = p99 = / autoname;
run;

/* Side-by-side table of the three coefficient sets: thetas, pais, phis. */
data tab_ma_coeff_all;
    merge v_av_thetas s1_ag_pais s2_res_phis;
    by aut_rank categ;
    drop _TYPE_ _FREQ_;
run;


/* Back out the unsmoothed (de-meaned) fund returns in excess of the category return. */
proc sort data = s2fund01; by categ fundid_mer yyyymm; run;

/*
  back_out_exrets

  Builds s2fund01_loop from s2fund01 and fills backed_ret_excat_aic recursively:
      backed(t) = ( dem_ret_excat(t) - phi1*backed(t-1) - phi2*backed(t-2)
                    - phi3*backed(t-3) ) / phi0
  for fund_seq = 4, 5, ..., last_seq. Rows with fund_seq 1-3 keep dem_ret_excat.

  Parameter
    last_seq   Optional. Highest fund_seq to process. When it is omitted the
               macro uses the largest fund_seq found in the data instead of
               the former hard-coded 276, so longer histories are fully
               processed. Results are unchanged whenever no fund_seq exceeds 276.

  Why one DATA step per sequence number: the LAGn calls run before the
  assignment, so they return the values stored by the previous pass.
*/
%macro back_out_exrets(last_seq=);
    %local i;

    /* Seed the recursion with the de-meaned excess return. */
    data s2fund01_loop;
        set s2fund01;
        backed_ret_excat_aic = dem_ret_excat;
        keep backed_ret_excat_aic dem_ret_excat categ fundid_mer yyyymm fund_seq av_ret_fundid
             sel_phi_0 sel_phi_1 sel_phi_2 sel_phi_3 av_retexcat_fundid;
    run;

    proc sort data = s2fund01_loop; by categ fundid_mer yyyymm; run;

    /* No bound supplied: take it from the data (0 when fund_seq is all missing). */
    %if %length(&last_seq) = 0 %then %do;
        proc sql noprint;
            select ceil(coalesce(max(fund_seq), 0)) format = 12.
              into :last_seq
              from s2fund01_loop;
        quit;
        /* Re-assigning strips the blanks PROC SQL pads the value with. */
        %let last_seq = &last_seq;
    %end;

    %do i = 4 %to &last_seq;

        data s2fund01_loop;
            set s2fund01_loop;
            lag1_backed_ret_excat_aic = lag1(backed_ret_excat_aic);
            lag2_backed_ret_excat_aic = lag2(backed_ret_excat_aic);
            lag3_backed_ret_excat_aic = lag3(backed_ret_excat_aic);

            if fund_seq = &i then backed_ret_excat_aic = (dem_ret_excat - sel_phi_1*lag1_backed_ret_excat_aic - sel_phi_2*lag2_backed_ret_excat_aic - sel_phi_3*lag3_backed_ret_excat_aic)/sel_phi_0;
        run;

    %end;

%mend back_out_exrets;
%back_out_exrets;


/* Keep the backed-out excess return under the name res_backed_ret_excat_aic. */
data s2fund01_loop (keep = categ fundid_mer yyyymm res_backed_ret_excat_aic);
    set s2fund01_loop;
    res_backed_ret_excat_aic = backed_ret_excat_aic;
run;

proc sort data = s2fund01_loop; by categ fundid_mer yyyymm; run;
proc sort data = s2fund01;      by categ fundid_mer yyyymm; run;

data s2fund02;
    merge s2fund01 s2fund01_loop;
    by categ fundid_mer yyyymm;
run;

/* Verification output: fund-level means of the step-2 input and output series
   (both are expected to be zero). */
proc summary data = s2fund02;
    by categ fundid_mer;
    var dem_ret_excat res_backed_ret_excat_aic;
    output out = check_step2_mean_excatret mean = / autoname;
run;

***********************************************************************;
**** STEP 3 : Add up residual from aggregate MA(3) and excess ret MA(3);
/* Step-1 output: unsmoothed category return (level and zero-mean versions). */
data s1_resid (keep = categ yyyymm av_aggrret_categ
                      backed_catret_aic dem_backed_catret_aic);
    set s1ag02;
run;

/* Step-2 output: unsmoothed fund return in excess of the category. */
data s2_resid (keep = categ fundid_mer yyyymm dem_ret_excat res_backed_ret_excat_aic);
    set s2fund02;
run;

/* Category-level pieces are matched on (categ, yyyymm) ... */
proc sort data = hf02;     by categ yyyymm; run;
proc sort data = s1_resid; by categ yyyymm; run;

data s3_00;
    merge hf02 s1_resid;
    by categ yyyymm;
run;

/* ... and fund-level pieces on (categ, fundid_mer, yyyymm). */
proc sort data = s3_00;    by categ fundid_mer yyyymm; run;
proc sort data = s2_resid; by categ fundid_mer yyyymm; run;

data s3_00;
    merge s3_00 s2_resid;
    by categ fundid_mer yyyymm;
run;

/* 3-step unsmoothed return, provisional version:
   zero-mean unsmoothed category return + unsmoothed excess return + fund mean. */
data s3_00;
    set s3_00;
    s3_uns_ret_aic_temp = dem_backed_catret_aic + res_backed_ret_excat_aic + av_ret_fundid;
run;

/* Fund-level means of the provisional series and of av_ret_fundid. */
proc sort data = s3_00; by categ fundid_mer; run;

proc summary data = s3_00;
    by categ fundid_mer;
    var s3_uns_ret_aic_temp av_ret_fundid;
    output out = w_adj_s3_uns mean = / autoname;
run;

/* Re-centre so the fund-level mean of s3_uns_ret_aic equals the mean of av_ret_fundid. */
data s3_00;
    merge s3_00 w_adj_s3_uns;
    by categ fundid_mer;
    s3_uns_ret_aic = s3_uns_ret_aic_temp - s3_uns_ret_aic_temp_mean + av_ret_fundid_mean;
    drop s3_uns_ret_aic_temp_mean av_ret_fundid_mean _TYPE_ _FREQ_;
run;

/* Verification output: category averages of every return series. */
proc sort data = s3_00; by categ; run;

proc summary data = s3_00;
    by categ;
    var ret av_ret_fundid backed_ret_aic s3_uns_ret_aic
        dem_backed_catret_aic res_backed_ret_excat_aic;
    output out = check_avret_s3_00_all mean = / autoname;
run;

/********************************************************************************************/
/********************************************************************************************/

***************************** PROCEED TO ANALYSIS OF UNSMOOTHED DATA;

**** Based on code "HF_output";
** Start from file with unsmoothed data;

/* Analysis panel: pick which unsmoothed series feed the tables. */
data s3_00a;
    set s3_00;
    glm_ret    = backed_ret_aic;     /* 1-step series; choose backed_ret_aic or backed_ret_fix */
    s3_ret     = s3_uns_ret_aic;     /* 3-step series; choose s3_uns_ret_aic or s3_uns_ret_fix */
    uns_catret = backed_catret_aic;  /* unsmoothed category return; choose aic or fix */
    keep fundid_mer yyyymm
         ret categ aut_rank strat_il_gr fundid_mer_obs fund_seq
         glm_ret s3_ret uns_catret;
run;

/* Factor data, with rf set to zero. */
data hffact;
    set vread_factor00;
    rf = 0;
run;

proc sort data = hffact nodupkey; by categ aut_rank yyyymm; run;

/* Left-join the factors onto the fund panel (every fund-month is kept). */
proc sort data = s3_00a; by categ aut_rank yyyymm; run;
proc sort data = hffact; by categ aut_rank yyyymm; run;

data s3_00a;
    merge s3_00a (in = a) hffact;
    by categ aut_rank yyyymm;
    if a;
run;

/* Returns net of rf for the three series. */
data s3_00a;
    set s3_00a;
    retrf     = ret     - rf;
    glm_retrf = glm_ret - rf;
    s3_retrf  = s3_ret  - rf;
run;

/* Observation count per fund, then per (aut_rank, categ): the number of funds
   and the average number of observations per fund. */
proc sort data = s3_00a; by aut_rank categ fundid_mer yyyymm; run;

proc summary data = s3_00a;
    by aut_rank categ fundid_mer;
    var ret;
    output out = v_count_obs_fund n = n_ret;
run;

proc summary data = v_count_obs_fund;
    by aut_rank categ;
    var n_ret;
    output out = v_count_obs_categ n = n_funds mean = avg_T;
run;

***************** Return on lagged returns Table;
*No need to drop first 3 observations in this step;
*use 4 lags;

/* Raw returns (not in excess of rf, not de-meaned) for the autocorrelation regressions. */
data lagret_fund (keep = ret yyyymm fundid_mer categ fund_seq aut_rank strat_il_gr
                         glm_ret s3_ret);
    set s3_00a;
run;

/* The section below is repeated three times, once per return series. */
/* Pass 1: reported return. */

%let f_rets = ret;

data lagret_fund_select;
    set lagret_fund;
    rets = &f_rets;
run;

proc sort data = lagret_fund_select; by categ fundid_mer yyyymm; run;

/* Four own lags. Lag k is blanked for the first k observations of a fund
   (fund_seq <= k), where the LAG queue would still hold another fund's rows. */
data lagret_fund_select;
    set lagret_fund_select;

    lag1_rets = lag1(rets);
    lag2_rets = lag2(rets);
    lag3_rets = lag3(rets);
    lag4_rets = lag4(rets);

    if fund_seq le 1 then lag1_rets = . ;
    if fund_seq le 2 then lag2_rets = . ;
    if fund_seq le 3 then lag3_rets = . ;
    if fund_seq le 4 then lag4_rets = . ;
run;

/* Regression sample: rows where all four lags are available by construction. */
data lagret_fund_clean;
    set lagret_fund_select (where = (fund_seq gt 4));
run;

/* Fund-by-fund regressions of the return on its own lags.
   MODEL1 uses all four lags, MODEL2-MODEL5 use one lag each. */
proc sort data = lagret_fund_clean; by aut_rank categ fundid_mer yyyymm; run;

proc reg data = lagret_fund_clean outest = v_lagret_fund_reg noprint tableout;
    by aut_rank categ fundid_mer;
    /* multivariate */
    model rets = lag1_rets lag2_rets lag3_rets lag4_rets / edf ADJRSQ;
    /* univariate */
    model rets = lag1_rets / edf ADJRSQ;
    model rets = lag2_rets / edf ADJRSQ;
    model rets = lag3_rets / edf ADJRSQ;
    model rets = lag4_rets / edf ADJRSQ;
run; quit;

/* Keep coefficient and t-statistic rows, number the models, and flag
   t-statistics of at least 1.65 (the flags are zero on coefficient rows). */
data v_lagret_fund_reg;
    set v_lagret_fund_reg;
    if _TYPE_ in ('PARMS', 'T');

    /* Characters 6-7 of the model label hold the model number. */
    model_n = substr(_MODEL_,6,2)*1;

    t_lag1_rets_ge_165 = (_TYPE_ = 'T' and lag1_rets ge 1.65);
    t_lag2_rets_ge_165 = (_TYPE_ = 'T' and lag2_rets ge 1.65);
    t_lag3_rets_ge_165 = (_TYPE_ = 'T' and lag3_rets ge 1.65);
    t_lag4_rets_ge_165 = (_TYPE_ = 'T' and lag4_rets ge 1.65);

    statis = _TYPE_;
run;

/* Cross-fund averages of coefficients, t-statistics and significance flags,
   per model, (aut_rank, categ) and statistic type. */
proc sort data = v_lagret_fund_reg; by model_n aut_rank categ statis; run;

proc summary data = v_lagret_fund_reg;
    by model_n aut_rank categ statis;
    var lag1_rets lag2_rets lag3_rets lag4_rets
        t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
    output out = v_lagret_fund_reg_avg
           mean = lag1_rets lag2_rets lag3_rets lag4_rets
                  t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

/* Extra rows labelled t_p10: the averaged flags from the T rows (that is, the
   share of funds with t >= 1.65) are moved into the coefficient columns. */
data wv_lagret_fund_reg_avg1;
    set v_lagret_fund_reg_avg (drop = lag1_rets lag2_rets lag3_rets lag4_rets);
    if statis = 'T';
    statis    = 't_p10';
    lag1_rets = t_lag1_rets_ge_165;
    lag2_rets = t_lag2_rets_ge_165;
    lag3_rets = t_lag3_rets_ge_165;
    lag4_rets = t_lag4_rets_ge_165;
run;

/* Stack the averages and the t_p10 rows; the flag columns are dropped afterwards. */
data w_lagret_fund_reg_avg_org;
    set v_lagret_fund_reg_avg wv_lagret_fund_reg_avg1;
    drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

/* Arrange the averaged fund-level regression output into one table for the
   return series named in &f_rets. Every model subset below inherits the
   (model_n, aut_rank, categ, statis) ordering set by this sort. */
proc sort data = w_lagret_fund_reg_avg_org; by model_n aut_rank categ statis; run;

/* Model 1: the regression on all four lags. */
data w_lagret_fund_reg_avg_org_m1; set w_lagret_fund_reg_avg_org; if model_n = 1; type = 'multivar'; run;

/* Models 2-5: the single-lag regressions. m2 keeps the identifying columns and
   lag1_rets; m3-m5 keep only their own coefficient column. */
data w_lagret_fund_reg_avg_org_m2; set w_lagret_fund_reg_avg_org; if model_n = 2; drop lag2_rets lag3_rets lag4_rets; run;
data w_lagret_fund_reg_avg_org_m3; set w_lagret_fund_reg_avg_org; if model_n = 3; keep lag2_rets; run;
data w_lagret_fund_reg_avg_org_m4; set w_lagret_fund_reg_avg_org; if model_n = 4; keep lag3_rets; run;
data w_lagret_fund_reg_avg_org_m5; set w_lagret_fund_reg_avg_org; if model_n = 5; keep lag4_rets; run;

/* NOTE(review): this MERGE has no BY statement, so the four subsets are paired
   row by row. That is only correct if m2-m5 have the same number of rows in
   the same (aut_rank, categ, statis) order - confirm. model_n stays at 2 on
   the merged rows because only m2 carries that column. */
data w_lagret_fund_reg_avg_org_m2345;
merge w_lagret_fund_reg_avg_org_m2 w_lagret_fund_reg_avg_org_m3 w_lagret_fund_reg_avg_org_m4 w_lagret_fund_reg_avg_org_m5;
type = 'univar';
run;


/* Final table: multivariate rows followed by the combined univariate rows. */
data lagret_fund_reg_avg_&f_rets;
set w_lagret_fund_reg_avg_org_m1 w_lagret_fund_reg_avg_org_m2345;
run;

proc sort data = lagret_fund_reg_avg_&f_rets; by model_n aut_rank categ statis; run;

/* Average univariate lag-1 coefficient per category, stored as r0_corr1_fund. */
data v_aucorr_fund_r0;
set lagret_fund_reg_avg_&f_rets;
if type = "univar";
if statis = "PARMS";
r0_corr1_fund = lag1_rets;
keep categ statis r0_corr1_fund;
run;

/* Equal-weighted category return per month and its four own lags, used by the
   aggregate autocorrelation regressions. */
proc sort data = lagret_fund_select; by aut_rank categ yyyymm; run;
proc summary data = lagret_fund_select;
    var rets;
    output out = lagret_fund_select_ew mean = rets_ew;
    by aut_rank categ yyyymm;
run;

/* Fix: lags used to be blanked only through the hard-coded dates 199501-199504.
   A category whose series starts later would take its first lags from the
   previous (aut_rank, categ) group. The group key is now compared with its own
   lag as well, so no lag crosses a group boundary. The date rules are kept, so
   the output is unchanged when every category starts in 199501. */
data lagret_fund_select_ew;
    set lagret_fund_select_ew;

    /* Single key for the BY group; CATX accepts character or numeric parts. */
    length _grp_key $200;
    _grp_key = catx('|', aut_rank, categ);

    /* Each LAGn on _grp_key sits in an always-evaluated IF condition, so its
       queue advances on every row. */
    lag1_rets_ew = lag1(rets_ew);
    if yyyymm le 199501 then lag1_rets_ew = . ;
    if lag1(_grp_key) ne _grp_key then lag1_rets_ew = . ;

    lag2_rets_ew = lag2(rets_ew);
    if yyyymm le 199502 then lag2_rets_ew = . ;
    if lag2(_grp_key) ne _grp_key then lag2_rets_ew = . ;

    lag3_rets_ew = lag3(rets_ew);
    if yyyymm le 199503 then lag3_rets_ew = . ;
    if lag3(_grp_key) ne _grp_key then lag3_rets_ew = . ;

    lag4_rets_ew = lag4(rets_ew);
    if yyyymm le 199504 then lag4_rets_ew = . ;
    if lag4(_grp_key) ne _grp_key then lag4_rets_ew = . ;

    drop _TYPE_ _FREQ_ _grp_key;
run;

/* Regression sample: months for which the fourth lag exists. */
data lagret_fund_clean_ew; set lagret_fund_select_ew; if lag4_rets_ew ne . ; run;

/* Category-level regressions of the equal-weighted return on its own lags.
   MODEL1 uses all four lags, MODEL2-MODEL5 use one lag each. */
proc sort data = lagret_fund_clean_ew; by aut_rank categ yyyymm; run;

proc reg data = lagret_fund_clean_ew outest = v_lagret_fund_reg_ew noprint tableout;
    by aut_rank categ;
    /* multivariate */
    model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew / edf ADJRSQ;
    /* univariate */
    model rets_ew = lag1_rets_ew / edf ADJRSQ;
    model rets_ew = lag2_rets_ew / edf ADJRSQ;
    model rets_ew = lag3_rets_ew / edf ADJRSQ;
    model rets_ew = lag4_rets_ew / edf ADJRSQ;
run; quit;

/* Keep coefficient, t-statistic and p-value rows; p-value rows are relabelled w_pv. */
data w_lagret_fund_reg_ew_org;
    set v_lagret_fund_reg_ew;
    if _TYPE_ in ('PARMS', 'T', 'PVALUE');

    /* Characters 6-7 of the model label hold the model number. */
    model_n = substr(_MODEL_,6,2)*1;

    statis = _TYPE_;
    if _TYPE_ = 'PVALUE' then statis = 'w_pv';

    keep aut_rank statis categ model_n _TYPE_
         lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

/* Arrange the category-level regression output into one table for the return
   series named in &f_rets. Every model subset below inherits the
   (model_n, aut_rank, categ, statis) ordering set by this sort. */
proc sort data = w_lagret_fund_reg_ew_org; by model_n aut_rank categ statis; run;

/* Model 1: the regression on all four lags. */
data w_lagret_fund_reg_ew_org_m1; set w_lagret_fund_reg_ew_org; if model_n = 1; type = 'multivar'; run;

/* Models 2-5: the single-lag regressions. m2 keeps the identifying columns and
   lag1_rets_ew; m3-m5 keep only their own coefficient column. */
data w_lagret_fund_reg_ew_org_m2; set w_lagret_fund_reg_ew_org; if model_n = 2; drop lag2_rets_ew lag3_rets_ew lag4_rets_ew; run;
data w_lagret_fund_reg_ew_org_m3; set w_lagret_fund_reg_ew_org; if model_n = 3; keep lag2_rets_ew; run;
data w_lagret_fund_reg_ew_org_m4; set w_lagret_fund_reg_ew_org; if model_n = 4; keep lag3_rets_ew; run;
data w_lagret_fund_reg_ew_org_m5; set w_lagret_fund_reg_ew_org; if model_n = 5; keep lag4_rets_ew; run;

/* NOTE(review): this MERGE has no BY statement, so the four subsets are paired
   row by row. That is only correct if m2-m5 have the same number of rows in
   the same order - confirm. */
data w_lagret_fund_reg_ew_org_m2345;
merge w_lagret_fund_reg_ew_org_m2 w_lagret_fund_reg_ew_org_m3 w_lagret_fund_reg_ew_org_m4 w_lagret_fund_reg_ew_org_m5;
type = 'univar';
run;

/* Final table: multivariate rows followed by the combined univariate rows. */
data lagret_fund_reg_ew_&f_rets;
set w_lagret_fund_reg_ew_org_m1 w_lagret_fund_reg_ew_org_m2345;
run;

/* Unlike the fund-level table, this sort key does not include categ. */
proc sort data = lagret_fund_reg_ew_&f_rets; by model_n aut_rank statis; run;

/* Univariate lag-1 coefficient of the aggregate series, stored as r0_corr1_aggr. */
data v_aucorr_aggr_r0;
set lagret_fund_reg_ew_&f_rets;
if type = "univar";
if statis = "PARMS";
r0_corr1_aggr = lag1_rets_ew;
keep categ statis r0_corr1_aggr;
run;


****************************;
* GLM_RET;

/* Pass 2: 1-step unsmoothed return (glm_ret). */
%let f_rets = glm_ret;

data lagret_fund_select;
    set lagret_fund;
    rets = &f_rets;
run;

proc sort data = lagret_fund_select; by categ fundid_mer yyyymm; run;

/* Four own lags. Lag k is blanked for the first k observations of a fund
   (fund_seq <= k), where the LAG queue would still hold another fund's rows. */
data lagret_fund_select;
    set lagret_fund_select;

    lag1_rets = lag1(rets);
    lag2_rets = lag2(rets);
    lag3_rets = lag3(rets);
    lag4_rets = lag4(rets);

    if fund_seq le 1 then lag1_rets = . ;
    if fund_seq le 2 then lag2_rets = . ;
    if fund_seq le 3 then lag3_rets = . ;
    if fund_seq le 4 then lag4_rets = . ;
run;

/* Regression sample: rows where all four lags are available by construction. */
data lagret_fund_clean;
    set lagret_fund_select (where = (fund_seq gt 4));
run;

/* Fund-by-fund regressions of the return on its own lags.
   MODEL1 uses all four lags, MODEL2-MODEL5 use one lag each. */
proc sort data = lagret_fund_clean; by aut_rank categ fundid_mer yyyymm; run;

proc reg data = lagret_fund_clean outest = v_lagret_fund_reg noprint tableout;
    by aut_rank categ fundid_mer;
    /* multivariate */
    model rets = lag1_rets lag2_rets lag3_rets lag4_rets / edf ADJRSQ;
    /* univariate */
    model rets = lag1_rets / edf ADJRSQ;
    model rets = lag2_rets / edf ADJRSQ;
    model rets = lag3_rets / edf ADJRSQ;
    model rets = lag4_rets / edf ADJRSQ;
run; quit;

/* Keep coefficient and t-statistic rows, number the models, and flag
   t-statistics of at least 1.65 (the flags are zero on coefficient rows). */
data v_lagret_fund_reg;
    set v_lagret_fund_reg;
    if _TYPE_ in ('PARMS', 'T');

    /* Characters 6-7 of the model label hold the model number. */
    model_n = substr(_MODEL_,6,2)*1;

    t_lag1_rets_ge_165 = (_TYPE_ = 'T' and lag1_rets ge 1.65);
    t_lag2_rets_ge_165 = (_TYPE_ = 'T' and lag2_rets ge 1.65);
    t_lag3_rets_ge_165 = (_TYPE_ = 'T' and lag3_rets ge 1.65);
    t_lag4_rets_ge_165 = (_TYPE_ = 'T' and lag4_rets ge 1.65);

    statis = _TYPE_;
run;

/* Cross-fund averages of coefficients, t-statistics and significance flags,
   per model, (aut_rank, categ) and statistic type. */
proc sort data = v_lagret_fund_reg; by model_n aut_rank categ statis; run;

proc summary data = v_lagret_fund_reg;
    by model_n aut_rank categ statis;
    var lag1_rets lag2_rets lag3_rets lag4_rets
        t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
    output out = v_lagret_fund_reg_avg
           mean = lag1_rets lag2_rets lag3_rets lag4_rets
                  t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

/* Extra rows labelled t_p10: the averaged flags from the T rows (that is, the
   share of funds with t >= 1.65) are moved into the coefficient columns. */
data wv_lagret_fund_reg_avg1;
    set v_lagret_fund_reg_avg (drop = lag1_rets lag2_rets lag3_rets lag4_rets);
    if statis = 'T';
    statis    = 't_p10';
    lag1_rets = t_lag1_rets_ge_165;
    lag2_rets = t_lag2_rets_ge_165;
    lag3_rets = t_lag3_rets_ge_165;
    lag4_rets = t_lag4_rets_ge_165;
run;

/* Stack the averages and the t_p10 rows; the flag columns are dropped afterwards. */
data w_lagret_fund_reg_avg_org;
    set v_lagret_fund_reg_avg wv_lagret_fund_reg_avg1;
    drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

/* Arrange the averaged fund-level regression output into one table for the
   return series named in &f_rets. Every model subset below inherits the
   (model_n, aut_rank, categ, statis) ordering set by this sort. */
proc sort data = w_lagret_fund_reg_avg_org; by model_n aut_rank categ statis; run;

/* Model 1: the regression on all four lags. */
data w_lagret_fund_reg_avg_org_m1; set w_lagret_fund_reg_avg_org; if model_n = 1; type = 'multivar'; run;

/* Models 2-5: the single-lag regressions. m2 keeps the identifying columns and
   lag1_rets; m3-m5 keep only their own coefficient column. */
data w_lagret_fund_reg_avg_org_m2; set w_lagret_fund_reg_avg_org; if model_n = 2; drop lag2_rets lag3_rets lag4_rets; run;
data w_lagret_fund_reg_avg_org_m3; set w_lagret_fund_reg_avg_org; if model_n = 3; keep lag2_rets; run;
data w_lagret_fund_reg_avg_org_m4; set w_lagret_fund_reg_avg_org; if model_n = 4; keep lag3_rets; run;
data w_lagret_fund_reg_avg_org_m5; set w_lagret_fund_reg_avg_org; if model_n = 5; keep lag4_rets; run;

/* NOTE(review): this MERGE has no BY statement, so the four subsets are paired
   row by row. That is only correct if m2-m5 have the same number of rows in
   the same (aut_rank, categ, statis) order - confirm. */
data w_lagret_fund_reg_avg_org_m2345;
merge w_lagret_fund_reg_avg_org_m2 w_lagret_fund_reg_avg_org_m3 w_lagret_fund_reg_avg_org_m4 w_lagret_fund_reg_avg_org_m5;
type = 'univar';
run;


/* Final table: multivariate rows followed by the combined univariate rows. */
data lagret_fund_reg_avg_&f_rets;
set w_lagret_fund_reg_avg_org_m1 w_lagret_fund_reg_avg_org_m2345;
run;

proc sort data = lagret_fund_reg_avg_&f_rets; by model_n aut_rank categ statis; run;

/* Average univariate lag-1 coefficient per category, stored as r1_corr1_fund. */
data v_aucorr_fund_r1;
set lagret_fund_reg_avg_&f_rets;
if type = "univar";
if statis = "PARMS";
r1_corr1_fund = lag1_rets;
keep categ statis r1_corr1_fund;
run;


/* Equal-weighted category return per month and its four own lags, used by the
   aggregate autocorrelation regressions. */
proc sort data = lagret_fund_select; by aut_rank categ yyyymm; run;
proc summary data = lagret_fund_select;
    var rets;
    output out = lagret_fund_select_ew mean = rets_ew;
    by aut_rank categ yyyymm;
run;

/* Fix: lags used to be blanked only through the hard-coded dates 199501-199504.
   A category whose series starts later would take its first lags from the
   previous (aut_rank, categ) group. The group key is now compared with its own
   lag as well, so no lag crosses a group boundary. The date rules are kept, so
   the output is unchanged when every category starts in 199501. */
data lagret_fund_select_ew;
    set lagret_fund_select_ew;

    /* Single key for the BY group; CATX accepts character or numeric parts. */
    length _grp_key $200;
    _grp_key = catx('|', aut_rank, categ);

    /* Each LAGn on _grp_key sits in an always-evaluated IF condition, so its
       queue advances on every row. */
    lag1_rets_ew = lag1(rets_ew);
    if yyyymm le 199501 then lag1_rets_ew = . ;
    if lag1(_grp_key) ne _grp_key then lag1_rets_ew = . ;

    lag2_rets_ew = lag2(rets_ew);
    if yyyymm le 199502 then lag2_rets_ew = . ;
    if lag2(_grp_key) ne _grp_key then lag2_rets_ew = . ;

    lag3_rets_ew = lag3(rets_ew);
    if yyyymm le 199503 then lag3_rets_ew = . ;
    if lag3(_grp_key) ne _grp_key then lag3_rets_ew = . ;

    lag4_rets_ew = lag4(rets_ew);
    if yyyymm le 199504 then lag4_rets_ew = . ;
    if lag4(_grp_key) ne _grp_key then lag4_rets_ew = . ;

    drop _TYPE_ _FREQ_ _grp_key;
run;

/* Regression sample: months for which the fourth lag exists. */
data lagret_fund_clean_ew; set lagret_fund_select_ew; if lag4_rets_ew ne . ; run;

/* Category-level regressions of the equal-weighted return on its own lags.
   MODEL1 uses all four lags, MODEL2-MODEL5 use one lag each. */
proc sort data = lagret_fund_clean_ew; by aut_rank categ yyyymm; run;

proc reg data = lagret_fund_clean_ew outest = v_lagret_fund_reg_ew noprint tableout;
    by aut_rank categ;
    /* multivariate */
    model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew / edf ADJRSQ;
    /* univariate */
    model rets_ew = lag1_rets_ew / edf ADJRSQ;
    model rets_ew = lag2_rets_ew / edf ADJRSQ;
    model rets_ew = lag3_rets_ew / edf ADJRSQ;
    model rets_ew = lag4_rets_ew / edf ADJRSQ;
run; quit;

/* Keep coefficient, t-statistic and p-value rows; p-value rows are relabelled w_pv. */
data w_lagret_fund_reg_ew_org;
    set v_lagret_fund_reg_ew;
    if _TYPE_ in ('PARMS', 'T', 'PVALUE');

    /* Characters 6-7 of the model label hold the model number. */
    model_n = substr(_MODEL_,6,2)*1;

    statis = _TYPE_;
    if _TYPE_ = 'PVALUE' then statis = 'w_pv';

    keep aut_rank statis categ model_n _TYPE_
         lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

/* Arrange the category-level regression output into one table for the return
   series named in &f_rets. Every model subset below inherits the
   (model_n, aut_rank, categ, statis) ordering set by this sort. */
proc sort data = w_lagret_fund_reg_ew_org; by model_n aut_rank categ statis; run;

/* Model 1: the regression on all four lags. */
data w_lagret_fund_reg_ew_org_m1; set w_lagret_fund_reg_ew_org; if model_n = 1; type = 'multivar'; run;

/* Models 2-5: the single-lag regressions. m2 keeps the identifying columns and
   lag1_rets_ew; m3-m5 keep only their own coefficient column. */
data w_lagret_fund_reg_ew_org_m2; set w_lagret_fund_reg_ew_org; if model_n = 2; drop lag2_rets_ew lag3_rets_ew lag4_rets_ew; run;
data w_lagret_fund_reg_ew_org_m3; set w_lagret_fund_reg_ew_org; if model_n = 3; keep lag2_rets_ew; run;
data w_lagret_fund_reg_ew_org_m4; set w_lagret_fund_reg_ew_org; if model_n = 4; keep lag3_rets_ew; run;
data w_lagret_fund_reg_ew_org_m5; set w_lagret_fund_reg_ew_org; if model_n = 5; keep lag4_rets_ew; run;

/* NOTE(review): this MERGE has no BY statement, so the four subsets are paired
   row by row. That is only correct if m2-m5 have the same number of rows in
   the same order - confirm. */
data w_lagret_fund_reg_ew_org_m2345;
merge w_lagret_fund_reg_ew_org_m2 w_lagret_fund_reg_ew_org_m3 w_lagret_fund_reg_ew_org_m4 w_lagret_fund_reg_ew_org_m5;
type = 'univar';
run;

/* Final table: multivariate rows followed by the combined univariate rows. */
data lagret_fund_reg_ew_&f_rets;
set w_lagret_fund_reg_ew_org_m1 w_lagret_fund_reg_ew_org_m2345;
run;

/* Unlike the fund-level table, this sort key does not include categ. */
proc sort data = lagret_fund_reg_ew_&f_rets; by model_n aut_rank statis; run;

/* Univariate lag-1 coefficient of the aggregate series, stored as r1_corr1_aggr. */
data v_aucorr_aggr_r1;
set lagret_fund_reg_ew_&f_rets;
if type = "univar";
if statis = "PARMS";
r1_corr1_aggr = lag1_rets_ew;
keep categ statis r1_corr1_aggr;
run;

****************************;
* S3 RET;

/* Pass 3: 3-step unsmoothed return (s3_ret). */
%let f_rets = s3_ret;

data lagret_fund_select;
    set lagret_fund;
    rets = &f_rets;
run;

proc sort data = lagret_fund_select; by categ fundid_mer yyyymm; run;

/* Four own lags. Lag k is blanked for the first k observations of a fund
   (fund_seq <= k), where the LAG queue would still hold another fund's rows. */
data lagret_fund_select;
    set lagret_fund_select;

    lag1_rets = lag1(rets);
    lag2_rets = lag2(rets);
    lag3_rets = lag3(rets);
    lag4_rets = lag4(rets);

    if fund_seq le 1 then lag1_rets = . ;
    if fund_seq le 2 then lag2_rets = . ;
    if fund_seq le 3 then lag3_rets = . ;
    if fund_seq le 4 then lag4_rets = . ;
run;

/* Regression sample: rows where all four lags are available by construction. */
data lagret_fund_clean;
    set lagret_fund_select (where = (fund_seq gt 4));
run;

/* Fund-by-fund regressions of the return on its own lags.
   MODEL1 uses all four lags, MODEL2-MODEL5 use one lag each. */
proc sort data = lagret_fund_clean; by aut_rank categ fundid_mer yyyymm; run;

proc reg data = lagret_fund_clean outest = v_lagret_fund_reg noprint tableout;
    by aut_rank categ fundid_mer;
    /* multivariate */
    model rets = lag1_rets lag2_rets lag3_rets lag4_rets / edf ADJRSQ;
    /* univariate */
    model rets = lag1_rets / edf ADJRSQ;
    model rets = lag2_rets / edf ADJRSQ;
    model rets = lag3_rets / edf ADJRSQ;
    model rets = lag4_rets / edf ADJRSQ;
run; quit;

/* Keep coefficient and t-statistic rows, number the models, and flag
   t-statistics of at least 1.65 (the flags are zero on coefficient rows). */
data v_lagret_fund_reg;
    set v_lagret_fund_reg;
    if _TYPE_ in ('PARMS', 'T');

    /* Characters 6-7 of the model label hold the model number. */
    model_n = substr(_MODEL_,6,2)*1;

    t_lag1_rets_ge_165 = (_TYPE_ = 'T' and lag1_rets ge 1.65);
    t_lag2_rets_ge_165 = (_TYPE_ = 'T' and lag2_rets ge 1.65);
    t_lag3_rets_ge_165 = (_TYPE_ = 'T' and lag3_rets ge 1.65);
    t_lag4_rets_ge_165 = (_TYPE_ = 'T' and lag4_rets ge 1.65);

    statis = _TYPE_;
run;

*Cross-fund averages of the coefficients, t-statistics and t-flags
 within each model, rank, category and statistic type;
proc sort data = v_lagret_fund_reg;
  by model_n aut_rank categ statis;
run;

proc summary data = v_lagret_fund_reg;
  by model_n aut_rank categ statis;
  var lag1_rets lag2_rets lag3_rets lag4_rets
      t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
  *an empty name list after mean makes each output mean keep the name of its analysis variable;
  output out = v_lagret_fund_reg_avg mean = ;
run;

*Turn the averaged t-flags on the T rows into extra rows labelled t_p10,
 stored in the lagN_rets columns so they stack under the coefficient rows;
data wv_lagret_fund_reg_avg1;
  set v_lagret_fund_reg_avg(drop = lag1_rets lag2_rets lag3_rets lag4_rets);
  where statis = 'T';
  statis = 't_p10';
  array flag_share{4} t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
  array slot{4} lag1_rets lag2_rets lag3_rets lag4_rets;
  do _k = 1 to 4;
    slot{_k} = flag_share{_k};
  end;
  drop _k;
run;

*Stack the t_p10 rows under the averages and drop the helper columns;
data w_lagret_fund_reg_avg_org;
  set v_lagret_fund_reg_avg
      wv_lagret_fund_reg_avg1;
  drop _TYPE_ t_lag1_rets_ge_165 t_lag2_rets_ge_165 t_lag3_rets_ge_165 t_lag4_rets_ge_165;
run;

proc sort data = w_lagret_fund_reg_avg_org;
  by model_n aut_rank categ statis;
run;

*Split the averaged results by model in one pass.
 Model 1 is the multivariate fit. Models 2 to 5 are the univariate fits,
 of which only the single relevant lag column is kept for models 3 to 5;
data w_lagret_fund_reg_avg_org_m1
     w_lagret_fund_reg_avg_org_m2(drop = type lag2_rets lag3_rets lag4_rets)
     w_lagret_fund_reg_avg_org_m3(keep = lag2_rets)
     w_lagret_fund_reg_avg_org_m4(keep = lag3_rets)
     w_lagret_fund_reg_avg_org_m5(keep = lag4_rets);
  set w_lagret_fund_reg_avg_org;
  type = 'multivar';
  select (model_n);
    when (1) output w_lagret_fund_reg_avg_org_m1;
    when (2) output w_lagret_fund_reg_avg_org_m2;
    when (3) output w_lagret_fund_reg_avg_org_m3;
    when (4) output w_lagret_fund_reg_avg_org_m4;
    when (5) output w_lagret_fund_reg_avg_org_m5;
    otherwise;
  end;
run;

*Place the four univariate slopes side by side.
 This is a positional merge with no BY statement, so it relies on the
 four pieces having the same row order from the sort above;
data w_lagret_fund_reg_avg_org_m2345;
  merge w_lagret_fund_reg_avg_org_m2
        w_lagret_fund_reg_avg_org_m3
        w_lagret_fund_reg_avg_org_m4
        w_lagret_fund_reg_avg_org_m5;
  type = 'univar';
run;


data lagret_fund_reg_avg_&f_rets;
  set w_lagret_fund_reg_avg_org_m1
      w_lagret_fund_reg_avg_org_m2345;
run;

proc sort data = lagret_fund_reg_avg_&f_rets;
  by model_n aut_rank categ statis;
run;

*Average fund-level univariate lag-1 slope per category, stored as r3_corr1_fund;
data v_aucorr_fund_r3;
  set lagret_fund_reg_avg_&f_rets;
  where type = "univar" and statis = "PARMS";
  r3_corr1_fund = lag1_rets;
  keep categ statis r3_corr1_fund;
run;


*Now aggregate: equal-weighted mean return per rank, category and month;
proc sort data = lagret_fund_select;
  by aut_rank categ yyyymm;
run;

proc summary data = lagret_fund_select;
  by aut_rank categ yyyymm;
  var rets;
  output out = lagret_fund_select_ew(drop = _TYPE_ _FREQ_) mean = rets_ew;
run;

*Lags of the aggregate series.
 A lag of order k is blanked for months up to 1995 month k (yyyymm at most 199500 plus k).
 NOTE(review): this presumably assumes every category series starts in 199501 - confirm.
 lagret_fund_clean_ew keeps only rows where the fourth lag is available;
data lagret_fund_select_ew lagret_fund_clean_ew;
  set lagret_fund_select_ew;
  lag1_rets_ew = lag1(rets_ew);
  lag2_rets_ew = lag2(rets_ew);
  lag3_rets_ew = lag3(rets_ew);
  lag4_rets_ew = lag4(rets_ew);
  array prior_ew{4} lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
  do _k = 1 to 4;
    if yyyymm le 199500 + _k then prior_ew{_k} = . ;
  end;
  drop _k;
  output lagret_fund_select_ew;
  if lag4_rets_ew ne . then output lagret_fund_clean_ew;
run;

*Autoregressions of the aggregate series, one set per rank and category;
proc sort data = lagret_fund_clean_ew;
  by aut_rank categ yyyymm;
run;

proc reg data = lagret_fund_clean_ew outest = v_lagret_fund_reg_ew noprint tableout;
  by aut_rank categ;
  *MODEL1: multivariate, all four lags jointly;
  model rets_ew = lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew / edf adjrsq;
  *MODEL2 to MODEL5: univariate, one lag at a time;
  model rets_ew = lag1_rets_ew / edf adjrsq;
  model rets_ew = lag2_rets_ew / edf adjrsq;
  model rets_ew = lag3_rets_ew / edf adjrsq;
  model rets_ew = lag4_rets_ew / edf adjrsq;
run; quit;

*Keep coefficient, t-statistic and p-value rows and number the models.
 The p-value rows are relabelled w_pv in statis;
data w_lagret_fund_reg_ew_org;
  set v_lagret_fund_reg_ew;
  where _TYPE_ in ('PARMS', 'T', 'PVALUE');
  *model number is the numeric suffix of the MODELn label;
  model_n = substr(_MODEL_,6,2)*1;
  statis = _TYPE_;
  if statis = 'PVALUE' then statis = 'w_pv';
  keep aut_rank statis categ model_n _TYPE_
       lag1_rets_ew lag2_rets_ew lag3_rets_ew lag4_rets_ew;
run;

proc sort data = w_lagret_fund_reg_ew_org;
  by model_n aut_rank categ statis;
run;

*Split the aggregate results by model in one pass.
 Model 1 is the multivariate fit. Models 2 to 5 are the univariate fits,
 of which only the single relevant lag column is kept for models 3 to 5;
data w_lagret_fund_reg_ew_org_m1
     w_lagret_fund_reg_ew_org_m2(drop = type lag2_rets_ew lag3_rets_ew lag4_rets_ew)
     w_lagret_fund_reg_ew_org_m3(keep = lag2_rets_ew)
     w_lagret_fund_reg_ew_org_m4(keep = lag3_rets_ew)
     w_lagret_fund_reg_ew_org_m5(keep = lag4_rets_ew);
  set w_lagret_fund_reg_ew_org;
  type = 'multivar';
  select (model_n);
    when (1) output w_lagret_fund_reg_ew_org_m1;
    when (2) output w_lagret_fund_reg_ew_org_m2;
    when (3) output w_lagret_fund_reg_ew_org_m3;
    when (4) output w_lagret_fund_reg_ew_org_m4;
    when (5) output w_lagret_fund_reg_ew_org_m5;
    otherwise;
  end;
run;

*Positional merge with no BY statement: the four univariate pieces
 must share the row order produced by the sort above;
data w_lagret_fund_reg_ew_org_m2345;
  merge w_lagret_fund_reg_ew_org_m2
        w_lagret_fund_reg_ew_org_m3
        w_lagret_fund_reg_ew_org_m4
        w_lagret_fund_reg_ew_org_m5;
  type = 'univar';
run;

data lagret_fund_reg_ew_&f_rets;
  set w_lagret_fund_reg_ew_org_m1
      w_lagret_fund_reg_ew_org_m2345;
run;

proc sort data = lagret_fund_reg_ew_&f_rets;
  by model_n aut_rank statis;
run;

*Univariate lag-1 slope of the aggregate series per category, stored as r3_corr1_aggr;
data v_aucorr_aggr_r3;
  set lagret_fund_reg_ew_&f_rets;
  where type = "univar" and statis = "PARMS";
  r3_corr1_aggr = lag1_rets_ew;
  keep categ statis r3_corr1_aggr;
run;

********************************************************************************************************************************;
******************* FH Regressions;
*run fund-by-fund regressions;

*First 3 observations of each fund are not unsmoothed: drop them,
 and order the remaining rows for the BY-group regressions;
proc sort data = s3_00a(where = (fund_seq gt 3)) out = s3_00_select;
  by aut_rank categ fundid_mer yyyymm;
run;

*Fund-by-fund factor regressions on fh1 to fh8 for three return series
 (retrf, glm_retrf, s3_retrf), with 0, 1, 2 and 3 lags of the factors.
 The model order fixes the MODEL1 to MODEL12 labels used by later steps, so do not reorder;
proc reg data = s3_00_select outest = v_s3_fund_fh00 noprint tableout;
  by aut_rank categ fundid_mer;
  *Regular - MODEL1 to MODEL3;
  model retrf     = fh1-fh8 / edf adjrsq;
  model glm_retrf = fh1-fh8 / edf adjrsq;
  model s3_retrf  = fh1-fh8 / edf adjrsq;
  *Dimson 1 lag - MODEL4 to MODEL6;
  model retrf     = fh1-fh8 l1_fh1-l1_fh8 / edf adjrsq;
  model glm_retrf = fh1-fh8 l1_fh1-l1_fh8 / edf adjrsq;
  model s3_retrf  = fh1-fh8 l1_fh1-l1_fh8 / edf adjrsq;
  *Dimson 2 lags - MODEL7 to MODEL9;
  model retrf     = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 / edf adjrsq;
  model glm_retrf = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 / edf adjrsq;
  model s3_retrf  = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 / edf adjrsq;
  *Dimson 3 lags - MODEL10 to MODEL12;
  model retrf     = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 l3_fh1-l3_fh8 / edf adjrsq;
  model glm_retrf = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 l3_fh1-l3_fh8 / edf adjrsq;
  model s3_retrf  = fh1-fh8 l1_fh1-l1_fh8 l2_fh1-l2_fh8 l3_fh1-l3_fh8 / edf adjrsq;
run; quit;

*R0 (retrf), contemporaneous factors only - MODEL1.
 Also records the number of regression observations as parameters plus error degrees of freedom;
data v_s3_fund_fh00_coef_r0;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'retrf' and _MODEL_ = "MODEL1";
  r0_fh_int = intercept;
  array est{8} fh1-fh8;
  array r0_beta{8} r0_fh1-r0_fh8;
  do _j = 1 to 8;
    r0_beta{_j} = est{_j};
  end;
  reg_obs = _P_ + _EDF_;
  keep aut_rank categ fundid_mer reg_obs r0_fh_int r0_fh1-r0_fh8;
run;

*R1 (glm_retrf), contemporaneous factors only - MODEL2;
data v_s3_fund_fh00_coef_r1;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'glm_retrf' and _MODEL_ = "MODEL2";
  r1_fh_int = intercept;
  array est{8} fh1-fh8;
  array r1_beta{8} r1_fh1-r1_fh8;
  do _j = 1 to 8;
    r1_beta{_j} = est{_j};
  end;
  keep aut_rank categ fundid_mer r1_fh_int r1_fh1-r1_fh8;
run;

*R3 (s3_retrf), contemporaneous factors only - MODEL3;
data v_s3_fund_fh00_coef_r3;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 's3_retrf' and _MODEL_ = "MODEL3";
  r3_fh_int = intercept;
  array est{8} fh1-fh8;
  array r3_beta{8} r3_fh1-r3_fh8;
  do _j = 1 to 8;
    r3_beta{_j} = est{_j};
  end;
  keep aut_rank categ fundid_mer r3_fh_int r3_fh1-r3_fh8;
run;

*R0, Dimson 1 lag - MODEL4: lag-0 betas, lag-1 betas and their sum per factor;
data v_s3_fund_fh00_coef_r0d1;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'retrf' and _MODEL_ = "MODEL4";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;

  array out_l0{8} r0_fh1_d1_l0 r0_fh2_d1_l0 r0_fh3_d1_l0 r0_fh4_d1_l0
                  r0_fh5_d1_l0 r0_fh6_d1_l0 r0_fh7_d1_l0 r0_fh8_d1_l0;
  array out_l1{8} r0_fh1_d1_l1 r0_fh2_d1_l1 r0_fh3_d1_l1 r0_fh4_d1_l1
                  r0_fh5_d1_l1 r0_fh6_d1_l1 r0_fh7_d1_l1 r0_fh8_d1_l1;
  array out_sum{8} r0_fh1_d1_sum r0_fh2_d1_sum r0_fh3_d1_sum r0_fh4_d1_sum
                   r0_fh5_d1_sum r0_fh6_d1_sum r0_fh7_d1_sum r0_fh8_d1_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j};
  end;

  *the r0_fh prefix matches exactly the 24 variables created above;
  keep aut_rank categ fundid_mer r0_fh: ;
run;

*R1, Dimson 1 lag - MODEL5: lag-0 betas, lag-1 betas and their sum per factor;
data v_s3_fund_fh00_coef_r1d1;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'glm_retrf' and _MODEL_ = "MODEL5";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;

  array out_l0{8} r1_fh1_d1_l0 r1_fh2_d1_l0 r1_fh3_d1_l0 r1_fh4_d1_l0
                  r1_fh5_d1_l0 r1_fh6_d1_l0 r1_fh7_d1_l0 r1_fh8_d1_l0;
  array out_l1{8} r1_fh1_d1_l1 r1_fh2_d1_l1 r1_fh3_d1_l1 r1_fh4_d1_l1
                  r1_fh5_d1_l1 r1_fh6_d1_l1 r1_fh7_d1_l1 r1_fh8_d1_l1;
  array out_sum{8} r1_fh1_d1_sum r1_fh2_d1_sum r1_fh3_d1_sum r1_fh4_d1_sum
                   r1_fh5_d1_sum r1_fh6_d1_sum r1_fh7_d1_sum r1_fh8_d1_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j};
  end;

  *the r1_fh prefix matches exactly the 24 variables created above;
  keep aut_rank categ fundid_mer r1_fh: ;
run;


*R3, Dimson 1 lag - MODEL6: lag-0 betas, lag-1 betas and their sum per factor;
data v_s3_fund_fh00_coef_r3d1;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 's3_retrf' and _MODEL_ = "MODEL6";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;

  array out_l0{8} r3_fh1_d1_l0 r3_fh2_d1_l0 r3_fh3_d1_l0 r3_fh4_d1_l0
                  r3_fh5_d1_l0 r3_fh6_d1_l0 r3_fh7_d1_l0 r3_fh8_d1_l0;
  array out_l1{8} r3_fh1_d1_l1 r3_fh2_d1_l1 r3_fh3_d1_l1 r3_fh4_d1_l1
                  r3_fh5_d1_l1 r3_fh6_d1_l1 r3_fh7_d1_l1 r3_fh8_d1_l1;
  array out_sum{8} r3_fh1_d1_sum r3_fh2_d1_sum r3_fh3_d1_sum r3_fh4_d1_sum
                   r3_fh5_d1_sum r3_fh6_d1_sum r3_fh7_d1_sum r3_fh8_d1_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j};
  end;

  *the r3_fh prefix matches exactly the 24 variables created above;
  keep aut_rank categ fundid_mer r3_fh: ;
run;


*R0, Dimson 2 lags - MODEL7: lag-0, lag-1 and lag-2 betas and their sum per factor;
data v_s3_fund_fh00_coef_r0d2;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'retrf' and _MODEL_ = "MODEL7";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;

  array out_l0{8} r0_fh1_d2_l0 r0_fh2_d2_l0 r0_fh3_d2_l0 r0_fh4_d2_l0
                  r0_fh5_d2_l0 r0_fh6_d2_l0 r0_fh7_d2_l0 r0_fh8_d2_l0;
  array out_l1{8} r0_fh1_d2_l1 r0_fh2_d2_l1 r0_fh3_d2_l1 r0_fh4_d2_l1
                  r0_fh5_d2_l1 r0_fh6_d2_l1 r0_fh7_d2_l1 r0_fh8_d2_l1;
  array out_l2{8} r0_fh1_d2_l2 r0_fh2_d2_l2 r0_fh3_d2_l2 r0_fh4_d2_l2
                  r0_fh5_d2_l2 r0_fh6_d2_l2 r0_fh7_d2_l2 r0_fh8_d2_l2;
  array out_sum{8} r0_fh1_d2_sum r0_fh2_d2_sum r0_fh3_d2_sum r0_fh4_d2_sum
                   r0_fh5_d2_sum r0_fh6_d2_sum r0_fh7_d2_sum r0_fh8_d2_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_l2{_j} = b_l2{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j};
  end;

  *the r0_fh prefix matches exactly the 32 variables created above;
  keep aut_rank categ fundid_mer r0_fh: ;
run;


*R1, Dimson 2 lags - MODEL8: lag-0, lag-1 and lag-2 betas and their sum per factor;
data v_s3_fund_fh00_coef_r1d2;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'glm_retrf' and _MODEL_ = "MODEL8";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;

  array out_l0{8} r1_fh1_d2_l0 r1_fh2_d2_l0 r1_fh3_d2_l0 r1_fh4_d2_l0
                  r1_fh5_d2_l0 r1_fh6_d2_l0 r1_fh7_d2_l0 r1_fh8_d2_l0;
  array out_l1{8} r1_fh1_d2_l1 r1_fh2_d2_l1 r1_fh3_d2_l1 r1_fh4_d2_l1
                  r1_fh5_d2_l1 r1_fh6_d2_l1 r1_fh7_d2_l1 r1_fh8_d2_l1;
  array out_l2{8} r1_fh1_d2_l2 r1_fh2_d2_l2 r1_fh3_d2_l2 r1_fh4_d2_l2
                  r1_fh5_d2_l2 r1_fh6_d2_l2 r1_fh7_d2_l2 r1_fh8_d2_l2;
  array out_sum{8} r1_fh1_d2_sum r1_fh2_d2_sum r1_fh3_d2_sum r1_fh4_d2_sum
                   r1_fh5_d2_sum r1_fh6_d2_sum r1_fh7_d2_sum r1_fh8_d2_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_l2{_j} = b_l2{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j};
  end;

  *the r1_fh prefix matches exactly the 32 variables created above;
  keep aut_rank categ fundid_mer r1_fh: ;
run;


*R3, Dimson 2 lags - MODEL9: lag-0, lag-1 and lag-2 betas and their sum per factor;
data v_s3_fund_fh00_coef_r3d2;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 's3_retrf' and _MODEL_ = "MODEL9";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;

  array out_l0{8} r3_fh1_d2_l0 r3_fh2_d2_l0 r3_fh3_d2_l0 r3_fh4_d2_l0
                  r3_fh5_d2_l0 r3_fh6_d2_l0 r3_fh7_d2_l0 r3_fh8_d2_l0;
  array out_l1{8} r3_fh1_d2_l1 r3_fh2_d2_l1 r3_fh3_d2_l1 r3_fh4_d2_l1
                  r3_fh5_d2_l1 r3_fh6_d2_l1 r3_fh7_d2_l1 r3_fh8_d2_l1;
  array out_l2{8} r3_fh1_d2_l2 r3_fh2_d2_l2 r3_fh3_d2_l2 r3_fh4_d2_l2
                  r3_fh5_d2_l2 r3_fh6_d2_l2 r3_fh7_d2_l2 r3_fh8_d2_l2;
  array out_sum{8} r3_fh1_d2_sum r3_fh2_d2_sum r3_fh3_d2_sum r3_fh4_d2_sum
                   r3_fh5_d2_sum r3_fh6_d2_sum r3_fh7_d2_sum r3_fh8_d2_sum;

  do _j = 1 to 8;
    out_l0{_j} = b_l0{_j};
    out_l1{_j} = b_l1{_j};
    out_l2{_j} = b_l2{_j};
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j};
  end;

  *the r3_fh prefix matches exactly the 32 variables created above;
  keep aut_rank categ fundid_mer r3_fh: ;
run;


*R0, Dimson 3 lags - MODEL10: only the summed beta per factor is kept;
data v_s3_fund_fh00_coef_r0d3;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'retrf' and _MODEL_ = "MODEL10";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;
  array b_l3{8} l3_fh1-l3_fh8;
  array out_sum{8} r0_fh1_d3_sum r0_fh2_d3_sum r0_fh3_d3_sum r0_fh4_d3_sum
                   r0_fh5_d3_sum r0_fh6_d3_sum r0_fh7_d3_sum r0_fh8_d3_sum;

  do _j = 1 to 8;
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j} + b_l3{_j};
  end;

  keep aut_rank categ fundid_mer r0_fh: ;
run;

*R1, Dimson 3 lags - MODEL11: only the summed beta per factor is kept;
data v_s3_fund_fh00_coef_r1d3;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 'glm_retrf' and _MODEL_ = "MODEL11";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;
  array b_l3{8} l3_fh1-l3_fh8;
  array out_sum{8} r1_fh1_d3_sum r1_fh2_d3_sum r1_fh3_d3_sum r1_fh4_d3_sum
                   r1_fh5_d3_sum r1_fh6_d3_sum r1_fh7_d3_sum r1_fh8_d3_sum;

  do _j = 1 to 8;
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j} + b_l3{_j};
  end;

  keep aut_rank categ fundid_mer r1_fh: ;
run;

*R3, Dimson 3 lags - MODEL12: only the summed beta per factor is kept;
data v_s3_fund_fh00_coef_r3d3;
  set v_s3_fund_fh00;
  where _TYPE_ = 'PARMS' and _DEPVAR_ = 's3_retrf' and _MODEL_ = "MODEL12";

  array b_l0{8} fh1-fh8;
  array b_l1{8} l1_fh1-l1_fh8;
  array b_l2{8} l2_fh1-l2_fh8;
  array b_l3{8} l3_fh1-l3_fh8;
  array out_sum{8} r3_fh1_d3_sum r3_fh2_d3_sum r3_fh3_d3_sum r3_fh4_d3_sum
                   r3_fh5_d3_sum r3_fh6_d3_sum r3_fh7_d3_sum r3_fh8_d3_sum;

  do _j = 1 to 8;
    out_sum{_j} = b_l0{_j} + b_l1{_j} + b_l2{_j} + b_l3{_j};
  end;

  keep aut_rank categ fundid_mer r3_fh: ;
run;

*Merge all coefficient sets into one row per fund.
 Every input comes from the same BY-group regression output, so all
 twelve are already ordered by aut_rank categ fundid_mer;
data v_s3_fund_fh00_coef_all;
  merge
    /* contemporaneous factors */
    v_s3_fund_fh00_coef_r0
    v_s3_fund_fh00_coef_r1
    v_s3_fund_fh00_coef_r3
    /* Dimson, 1 lag */
    v_s3_fund_fh00_coef_r0d1
    v_s3_fund_fh00_coef_r1d1
    v_s3_fund_fh00_coef_r3d1
    /* Dimson, 2 lags */
    v_s3_fund_fh00_coef_r0d2
    v_s3_fund_fh00_coef_r1d2
    v_s3_fund_fh00_coef_r3d2
    /* Dimson, 3 lags */
    v_s3_fund_fh00_coef_r0d3
    v_s3_fund_fh00_coef_r1d3
    v_s3_fund_fh00_coef_r3d3
  ;
  by aut_rank categ fundid_mer;
run;

*Diagnostic: mean 2-lag and 3-lag Dimson sums for factors 1 to 4,
 grouped by rank, category and number of regression observations;
proc sort data = v_s3_fund_fh00_coef_all;
  by aut_rank categ reg_obs;
run;

proc summary data = v_s3_fund_fh00_coef_all;
  by aut_rank categ reg_obs;
  var r0_fh1_d2_sum r0_fh1_d3_sum r0_fh2_d2_sum r0_fh2_d3_sum
      r0_fh3_d2_sum r0_fh3_d3_sum r0_fh4_d2_sum r0_fh4_d3_sum;
  *an empty name list after mean makes each output mean keep the name of its analysis variable;
  output out = w_check_dims3_byregobs mean = ;
run;

*For funds with less than 4 years of data (fewer than 45 regression observations),
 use the 2-lag Dimson sums in place of the 3-lag sums even for the dims3 variables;
data v_s3_fund_fh00_coef_all;
  set v_s3_fund_fh00_coef_all;

  array sum_2lag{24}
    r0_fh1_d2_sum r0_fh2_d2_sum r0_fh3_d2_sum r0_fh4_d2_sum r0_fh5_d2_sum r0_fh6_d2_sum r0_fh7_d2_sum r0_fh8_d2_sum
    r1_fh1_d2_sum r1_fh2_d2_sum r1_fh3_d2_sum r1_fh4_d2_sum r1_fh5_d2_sum r1_fh6_d2_sum r1_fh7_d2_sum r1_fh8_d2_sum
    r3_fh1_d2_sum r3_fh2_d2_sum r3_fh3_d2_sum r3_fh4_d2_sum r3_fh5_d2_sum r3_fh6_d2_sum r3_fh7_d2_sum r3_fh8_d2_sum;
  array sum_3lag{24}
    r0_fh1_d3_sum r0_fh2_d3_sum r0_fh3_d3_sum r0_fh4_d3_sum r0_fh5_d3_sum r0_fh6_d3_sum r0_fh7_d3_sum r0_fh8_d3_sum
    r1_fh1_d3_sum r1_fh2_d3_sum r1_fh3_d3_sum r1_fh4_d3_sum r1_fh5_d3_sum r1_fh6_d3_sum r1_fh7_d3_sum r1_fh8_d3_sum
    r3_fh1_d3_sum r3_fh2_d3_sum r3_fh3_d3_sum r3_fh4_d3_sum r3_fh5_d3_sum r3_fh6_d3_sum r3_fh7_d3_sum r3_fh8_d3_sum;

  if reg_obs lt 45 then do _k = 1 to 24;
    sum_3lag{_k} = sum_2lag{_k};
  end;
  drop _k;
run;

*****************************************************;
*Further: Constrained Dimson Coefficients;
*Run macro for Constrained Dimson;
*Need fundid_mer list;

*One row per category and fund present in the regression sample;
proc sort data = s3_00_select(keep = categ fundid_mer) out = w_fundid_list nodupkey;
  by categ fundid_mer;
run;

%macro cdims;

/*
  Constrained Dimson regressions with 3 lags, estimated fund by fund with PROC NLIN.

  Model, for each return series y in (retrf, glm_retrf, s3_retrf):
    y = b0 + sum over j of b_j*fh_j
           + omeg1 * sum over j of b_j*l1_fh_j
           + omeg2 * sum over j of b_j*l2_fh_j
           + omeg3 * sum over j of b_j*l3_fh_j
  with each omega bounded to the interval 0 to 1.

  The fit is repeated from 7 starting values for the omegas
  (0.01, 0.16, 0.31, 0.46, 0.61, 0.76, 0.91). For each fund the solution with
  the smallest SSE is kept. When several starts tie on SSE, the largest
  starting value among them is the one retained.

  Inputs : s3_00_select, w_fundid_list
  Outputs: r0_cd3_coeff, r1_cd3_coeff, r3_cd3_coeff (one row per categ fundid_mer)
  Work   : w_cd3_dim_<series>, w_cd3_dim_<series>_<start> for each series and start
*/

/* helper macro variables private to this macro */
%local cd_s cd_ser cd_dep cd_j cd_lg;

/* PROC NLIN runs BY categ fundid_mer, so the input must be in that order */
proc sort data = s3_00_select; by categ fundid_mer yyyymm; run;

/* ---- Step 1: estimate from every starting value, for every return series ---- */
%do multin = 1 %to 91 %by 15;
  %let multi = %sysevalf(0.01*&multin);

  %do cd_s = 1 %to 3;
    /* series tag (r0 reported, r1 GLM, r3 3-step) and its dependent variable */
    %let cd_ser = %scan(r0 r1 r3, &cd_s, %str( ));
    %let cd_dep = %scan(retrf glm_retrf s3_retrf, &cd_s, %str( ));

    %ODSOff
    proc nlin data=s3_00_select outest = w_cd3_dim_&cd_ser SMETHOD=GOLDEN noprint;
      parameters b0=0
        %do cd_j = 1 %to 8; b&cd_j=1 %end;
        %do cd_lg = 1 %to 3; omeg&cd_lg = &multi %end;
      ;
      bounds 0<= omeg1 <= 1, 0<= omeg2 <= 1, 0<= omeg3 <= 1;
      model &cd_dep = b0
        %do cd_j = 1 %to 8; +b&cd_j*fh&cd_j %end;
        %do cd_lg = 1 %to 3;
          %do cd_j = 1 %to 8; +omeg&cd_lg*b&cd_j*l&cd_lg._fh&cd_j %end;
        %end;
      ;
      by categ fundid_mer;
    run; quit;
    %ODSOn

    /* keep the final estimates and tag every column with the starting value */
    data w_cd3_dim_&cd_ser._&multin;
      set w_cd3_dim_&cd_ser;
      if _TYPE_ = "FINAL";
      val_&multin = &multi;
      STATUS_&multin = _STATUS_;
      SSE_&multin = _SSE_;
      %do cd_j = 1 %to 8;
        &cd_ser._fh&cd_j._cd3_&multin = b&cd_j;
      %end;
      %do cd_lg = 1 %to 3;
        &cd_ser._omeg&cd_lg._cd3_&multin = omeg&cd_lg;
      %end;
      keep categ fundid_mer val_&multin STATUS_&multin SSE_&multin
        %do cd_j = 1 %to 8; &cd_ser._fh&cd_j._cd3_&multin %end;
        %do cd_lg = 1 %to 3; &cd_ser._omeg&cd_lg._cd3_&multin %end;
      ;
    run;

  %end;

%end;

/* ---- Step 2: per series, merge all starts and keep the minimum-SSE solution ---- */
%do cd_s = 1 %to 3;
  %let cd_ser = %scan(r0 r1 r3, &cd_s, %str( ));

  proc sort data = w_fundid_list; by categ fundid_mer; run;

  data &cd_ser._cd3_coeff;
    merge w_fundid_list
      %do multin = 1 %to 91 %by 15; w_cd3_dim_&cd_ser._&multin %end;
    ;
    by categ fundid_mer;
  run;

  data &cd_ser._cd3_coeff;
    set &cd_ser._cd3_coeff;

    /* smallest SSE across the starting values */
    min_sse = min(sse_1 %do multin = 16 %to 91 %by 15; ,SSE_&multin %end;);
    min_sse_&cd_ser.cd3 = min_sse;

    /* the first start is the default; each later start overrides it when it attains the minimum */
    %do multin = 1 %to 91 %by 15;
      %if &multin > 1 %then %do;
        if min_sse = SSE_&multin then
      %end;
      do;
        &cd_ser.cd3_min_sse_omeg_start = val_&multin;
        &cd_ser.cd3_best_status = STATUS_&multin;
        %do cd_j = 1 %to 8;
          &cd_ser.cd3_best_fh&cd_j = &cd_ser._fh&cd_j._cd3_&multin;
        %end;
        %do cd_lg = 1 %to 3;
          &cd_ser.cd3_best_omega&cd_lg = &cd_ser._omeg&cd_lg._cd3_&multin;
        %end;
      end;
    %end;

    keep categ fundid_mer &cd_ser.cd3_best_status &cd_ser.cd3_min_sse_omeg_start
      %do cd_j = 1 %to 8; &cd_ser.cd3_best_fh&cd_j %end;
      %do cd_lg = 1 %to 3; &cd_ser.cd3_best_omega&cd_lg %end;
      min_sse_&cd_ser.cd3;
  run;

%end;

%mend cdims;
%cdims;




*Add the constrained Dimson coefficients to the other ones.
 All four inputs are ordered by categ fundid_mer first. Only funds already present
 in the unconstrained coefficient table are retained;
proc sort data = v_s3_fund_fh00_coef_all; by categ fundid_mer; run;
proc sort data = r0_cd3_coeff;            by categ fundid_mer; run;
proc sort data = r1_cd3_coeff;            by categ fundid_mer; run;
proc sort data = r3_cd3_coeff;            by categ fundid_mer; run;

data v_s3_fund_fh00_coef_all;
  merge v_s3_fund_fh00_coef_all(in = in_base)
        r0_cd3_coeff
        r1_cd3_coeff
        r3_cd3_coeff;
  by categ fundid_mer;
  if in_base;
run;

*Constrained Dimson, 3 lags: total loading per factor for each return series,
 computed as beta + omega1*beta + omega2*beta + omega3*beta.
 Array rows are the series R0, R1, R3 and array columns are the factors 1 to 8;
data v_s3_fund_fh00_coef_all;
  set v_s3_fund_fh00_coef_all;

  array cd_total{3,8}
    r0cd3_fh1_sum r0cd3_fh2_sum r0cd3_fh3_sum r0cd3_fh4_sum r0cd3_fh5_sum r0cd3_fh6_sum r0cd3_fh7_sum r0cd3_fh8_sum
    r1cd3_fh1_sum r1cd3_fh2_sum r1cd3_fh3_sum r1cd3_fh4_sum r1cd3_fh5_sum r1cd3_fh6_sum r1cd3_fh7_sum r1cd3_fh8_sum
    r3cd3_fh1_sum r3cd3_fh2_sum r3cd3_fh3_sum r3cd3_fh4_sum r3cd3_fh5_sum r3cd3_fh6_sum r3cd3_fh7_sum r3cd3_fh8_sum;
  array cd_beta{3,8}
    r0cd3_best_fh1-r0cd3_best_fh8
    r1cd3_best_fh1-r1cd3_best_fh8
    r3cd3_best_fh1-r3cd3_best_fh8;
  array cd_omega{3,3}
    r0cd3_best_omega1-r0cd3_best_omega3
    r1cd3_best_omega1-r1cd3_best_omega3
    r3cd3_best_omega1-r3cd3_best_omega3;

  do _s = 1 to 3;
    do _j = 1 to 8;
      cd_total{_s,_j} = cd_beta{_s,_j}
                        +cd_omega{_s,1}*cd_beta{_s,_j}
                        +cd_omega{_s,2}*cd_beta{_s,_j}
                        +cd_omega{_s,3}*cd_beta{_s,_j};
    end;
  end;
  drop _s _j;
run;


*Check omega by category: mean, 10th and 90th percentile of the selected omegas
 for each return series, with output names generated by autoname;
proc sort data = v_s3_fund_fh00_coef_all;
  by aut_rank categ;
run;

proc summary data = v_s3_fund_fh00_coef_all;
  by aut_rank categ;
  var r0cd3_best_omega1-r0cd3_best_omega3
      r1cd3_best_omega1-r1cd3_best_omega3
      r3cd3_best_omega1-r3cd3_best_omega3;
  output out = v_omegas_cat mean = p10 = p90 = / autoname;
run;

**************;

*Slim copy of the fund-month panel - the three return series, the eight fh;
*factors with their lags 1 to 3, and the identifiers needed for merging;
data v_rets (keep = retrf glm_retrf s3_retrf fund_seq yyyymm
                    fh1-fh8
                    l1_fh1-l1_fh8
                    l2_fh1-l2_fh8
                    l3_fh1-l3_fh8
                    aut_rank categ fundid_mer);
  set s3_00_select;
run;

*Both inputs must share the same sort order before the match-merge;
proc sort data = v_rets;
  by aut_rank categ fundid_mer;
run;

proc sort data = v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

*Attach the fund-level coefficients to every fund-month observation;
data v_rets_betas;
  merge v_rets
        v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

*Calculate monthly alphas;
*Each alpha is a return series minus the factor exposures times the factors;
*Prefix r0 uses retrf, r1 uses glm_retrf and r3 uses s3_retrf;
*Suffix v1 subtracts contemporaneous and lagged factor terms separately;
*Suffix v2 applies the summed beta to the contemporaneous factor only;
*Fix - r1_fh_a_d1v2 and r3_fh_a_d1v2 previously started from retrf, which;
*was inconsistent with every other r1 and r3 alpha in this step. They now;
*start from glm_retrf and s3_retrf respectively;
data v_rets_betas;
set v_rets_betas;
*contemporaneous factors;
r0_fh_a = retrf -r0_fh1*fh1 -r0_fh2*fh2 -r0_fh3*fh3 -r0_fh4*fh4 
-r0_fh5*fh5 -r0_fh6*fh6 -r0_fh7*fh7 -r0_fh8*fh8;

r1_fh_a = glm_retrf -r1_fh1*fh1 -r1_fh2*fh2 -r1_fh3*fh3 -r1_fh4*fh4 
-r1_fh5*fh5 -r1_fh6*fh6 -r1_fh7*fh7 -r1_fh8*fh8;

r3_fh_a = s3_retrf -r3_fh1*fh1 -r3_fh2*fh2 -r3_fh3*fh3 -r3_fh4*fh4 
-r3_fh5*fh5 -r3_fh6*fh6 -r3_fh7*fh7 -r3_fh8*fh8;
*Dimson, 1 lag, v1 (lag 0 and lag 1 terms subtracted separately);
r0_fh_a_d1v1 = retrf -r0_fh1_d1_l0*fh1 -r0_fh2_d1_l0*fh2 -r0_fh3_d1_l0*fh3 -r0_fh4_d1_l0*fh4 
-r0_fh5_d1_l0*fh5 -r0_fh6_d1_l0*fh6 -r0_fh7_d1_l0*fh7 -r0_fh8_d1_l0*fh8
-r0_fh1_d1_l1*l1_fh1 -r0_fh2_d1_l1*l1_fh2 -r0_fh3_d1_l1*l1_fh3 -r0_fh4_d1_l1*l1_fh4 
-r0_fh5_d1_l1*l1_fh5 -r0_fh6_d1_l1*l1_fh6 -r0_fh7_d1_l1*l1_fh7 -r0_fh8_d1_l1*l1_fh8;

r1_fh_a_d1v1 = glm_retrf -r1_fh1_d1_l0*fh1 -r1_fh2_d1_l0*fh2 -r1_fh3_d1_l0*fh3 -r1_fh4_d1_l0*fh4 
-r1_fh5_d1_l0*fh5 -r1_fh6_d1_l0*fh6 -r1_fh7_d1_l0*fh7 -r1_fh8_d1_l0*fh8
-r1_fh1_d1_l1*l1_fh1 -r1_fh2_d1_l1*l1_fh2 -r1_fh3_d1_l1*l1_fh3 -r1_fh4_d1_l1*l1_fh4 
-r1_fh5_d1_l1*l1_fh5 -r1_fh6_d1_l1*l1_fh6 -r1_fh7_d1_l1*l1_fh7 -r1_fh8_d1_l1*l1_fh8;

r3_fh_a_d1v1 = s3_retrf -r3_fh1_d1_l0*fh1 -r3_fh2_d1_l0*fh2 -r3_fh3_d1_l0*fh3 -r3_fh4_d1_l0*fh4 
-r3_fh5_d1_l0*fh5 -r3_fh6_d1_l0*fh6 -r3_fh7_d1_l0*fh7 -r3_fh8_d1_l0*fh8
-r3_fh1_d1_l1*l1_fh1 -r3_fh2_d1_l1*l1_fh2 -r3_fh3_d1_l1*l1_fh3 -r3_fh4_d1_l1*l1_fh4 
-r3_fh5_d1_l1*l1_fh5 -r3_fh6_d1_l1*l1_fh6 -r3_fh7_d1_l1*l1_fh7 -r3_fh8_d1_l1*l1_fh8;
*Dimson, 1 lag, v2 (beta=sum);
r0_fh_a_d1v2 = retrf -r0_fh1_d1_sum*fh1 -r0_fh2_d1_sum*fh2 -r0_fh3_d1_sum*fh3 -r0_fh4_d1_sum*fh4 
-r0_fh5_d1_sum*fh5 -r0_fh6_d1_sum*fh6 -r0_fh7_d1_sum*fh7 -r0_fh8_d1_sum*fh8;

*r1 alpha is based on glm_retrf, matching the d2v2, d3v2 and cd3v2 versions;
r1_fh_a_d1v2 = glm_retrf -r1_fh1_d1_sum*fh1 -r1_fh2_d1_sum*fh2 -r1_fh3_d1_sum*fh3 -r1_fh4_d1_sum*fh4 
-r1_fh5_d1_sum*fh5 -r1_fh6_d1_sum*fh6 -r1_fh7_d1_sum*fh7 -r1_fh8_d1_sum*fh8;

*r3 alpha is based on s3_retrf, matching the d2v2, d3v2 and cd3v2 versions;
r3_fh_a_d1v2 = s3_retrf -r3_fh1_d1_sum*fh1 -r3_fh2_d1_sum*fh2 -r3_fh3_d1_sum*fh3 -r3_fh4_d1_sum*fh4 
-r3_fh5_d1_sum*fh5 -r3_fh6_d1_sum*fh6 -r3_fh7_d1_sum*fh7 -r3_fh8_d1_sum*fh8;
*Dimson, 2 lag, v1 (only computed for r0 in this step);
r0_fh_a_d2v1 = retrf -r0_fh1_d2_l0*fh1 -r0_fh2_d2_l0*fh2 -r0_fh3_d2_l0*fh3 -r0_fh4_d2_l0*fh4 
-r0_fh5_d2_l0*fh5 -r0_fh6_d2_l0*fh6 -r0_fh7_d2_l0*fh7 -r0_fh8_d2_l0*fh8
-r0_fh1_d2_l1*l1_fh1 -r0_fh2_d2_l1*l1_fh2 -r0_fh3_d2_l1*l1_fh3 -r0_fh4_d2_l1*l1_fh4 
-r0_fh5_d2_l1*l1_fh5 -r0_fh6_d2_l1*l1_fh6 -r0_fh7_d2_l1*l1_fh7 -r0_fh8_d2_l1*l1_fh8
-r0_fh1_d2_l2*l2_fh1 -r0_fh2_d2_l2*l2_fh2 -r0_fh3_d2_l2*l2_fh3 -r0_fh4_d2_l2*l2_fh4 
-r0_fh5_d2_l2*l2_fh5 -r0_fh6_d2_l2*l2_fh6 -r0_fh7_d2_l2*l2_fh7 -r0_fh8_d2_l2*l2_fh8;
*Dimson, 2 lag, v2 (beta=sum);
r0_fh_a_d2v2 = retrf -r0_fh1_d2_sum*fh1 -r0_fh2_d2_sum*fh2 -r0_fh3_d2_sum*fh3 -r0_fh4_d2_sum*fh4 
-r0_fh5_d2_sum*fh5 -r0_fh6_d2_sum*fh6 -r0_fh7_d2_sum*fh7 -r0_fh8_d2_sum*fh8;

r1_fh_a_d2v2 = glm_retrf -r1_fh1_d2_sum*fh1 -r1_fh2_d2_sum*fh2 -r1_fh3_d2_sum*fh3 -r1_fh4_d2_sum*fh4 
-r1_fh5_d2_sum*fh5 -r1_fh6_d2_sum*fh6 -r1_fh7_d2_sum*fh7 -r1_fh8_d2_sum*fh8;

r3_fh_a_d2v2 = s3_retrf -r3_fh1_d2_sum*fh1 -r3_fh2_d2_sum*fh2 -r3_fh3_d2_sum*fh3 -r3_fh4_d2_sum*fh4 
-r3_fh5_d2_sum*fh5 -r3_fh6_d2_sum*fh6 -r3_fh7_d2_sum*fh7 -r3_fh8_d2_sum*fh8;
*Dimson, 3 lag, v2 (beta=sum);
r0_fh_a_d3v2 = retrf -r0_fh1_d3_sum*fh1 -r0_fh2_d3_sum*fh2 -r0_fh3_d3_sum*fh3 -r0_fh4_d3_sum*fh4 
-r0_fh5_d3_sum*fh5 -r0_fh6_d3_sum*fh6 -r0_fh7_d3_sum*fh7 -r0_fh8_d3_sum*fh8;

r1_fh_a_d3v2 = glm_retrf -r1_fh1_d3_sum*fh1 -r1_fh2_d3_sum*fh2 -r1_fh3_d3_sum*fh3 -r1_fh4_d3_sum*fh4 
-r1_fh5_d3_sum*fh5 -r1_fh6_d3_sum*fh6 -r1_fh7_d3_sum*fh7 -r1_fh8_d3_sum*fh8;

r3_fh_a_d3v2 = s3_retrf -r3_fh1_d3_sum*fh1 -r3_fh2_d3_sum*fh2 -r3_fh3_d3_sum*fh3 -r3_fh4_d3_sum*fh4 
-r3_fh5_d3_sum*fh5 -r3_fh6_d3_sum*fh6 -r3_fh7_d3_sum*fh7 -r3_fh8_d3_sum*fh8;
*CDimson, 3 lag, v2 (beta=sum);
r0_fh_a_cd3v2 = retrf -r0cd3_fh1_sum*fh1 -r0cd3_fh2_sum*fh2 -r0cd3_fh3_sum*fh3 -r0cd3_fh4_sum*fh4 
-r0cd3_fh5_sum*fh5 -r0cd3_fh6_sum*fh6 -r0cd3_fh7_sum*fh7 -r0cd3_fh8_sum*fh8;

r1_fh_a_cd3v2 = glm_retrf -r1cd3_fh1_sum*fh1 -r1cd3_fh2_sum*fh2 -r1cd3_fh3_sum*fh3 -r1cd3_fh4_sum*fh4 
-r1cd3_fh5_sum*fh5 -r1cd3_fh6_sum*fh6 -r1cd3_fh7_sum*fh7 -r1cd3_fh8_sum*fh8;

r3_fh_a_cd3v2 = s3_retrf -r3cd3_fh1_sum*fh1 -r3cd3_fh2_sum*fh2 -r3cd3_fh3_sum*fh3 -r3cd3_fh4_sum*fh4 
-r3cd3_fh5_sum*fh5 -r3cd3_fh6_sum*fh6 -r3cd3_fh7_sum*fh7 -r3cd3_fh8_sum*fh8;
run;

*Working copy of the fund-month file with short aliases r0 r1 r3 for the;
*three return series, so their standard deviations can be requested below;
data v_rets_betas_gt12mm;
  set v_rets_betas;
  r0 = retrf;
  r1 = glm_retrf;
  r3 = s3_retrf;
run;

proc sort data = v_rets_betas_gt12mm;
  by aut_rank categ fundid_mer;
run;

*Per fund - time series mean of each contemporaneous-factor alpha and;
*standard deviation of each return series. Each statistic is requested;
*only for the variables it is needed for, so no throwaway columns arise;
proc means data = v_rets_betas_gt12mm noprint;
  by aut_rank categ fundid_mer;
  var r0_fh_a r1_fh_a r3_fh_a r0 r1 r3;
  output out = fund_fhalpha_avg
         mean(r0_fh_a r1_fh_a r3_fh_a) = r0_fh_a r1_fh_a r3_fh_a
         std(r0 r1 r3)                 = r0_std r1_std r3_std;
run;

*Per fund - t statistic of each alpha series;
proc means data = v_rets_betas_gt12mm noprint;
  by aut_rank categ fundid_mer;
  var r0_fh_a r1_fh_a r3_fh_a;
  output out = fund_fhalpha_t
         t = r0_fh_a r1_fh_a r3_fh_a;
run;

*Combine the per fund average alphas with the per fund coefficient file;
*The PROC MEANS bookkeeping columns are removed on output;
data s3_fund_fh00_coef_all (drop = _TYPE_ _FREQ_);
  merge fund_fhalpha_avg
        v_s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

*Cross sectional mean of the fund alphas within each aut_rank by categ cell;
proc sort data = s3_fund_fh00_coef_all;
  by aut_rank categ fundid_mer;
run;

proc means data = s3_fund_fh00_coef_all noprint;
  by aut_rank categ;
  var r0_fh_a r1_fh_a r3_fh_a;
  output out = s3_fund_fh00_coef_all_av
         mean = r0_fh_a r1_fh_a r3_fh_a;
run;

*Tag the rows with the statistic they contain and drop the bookkeeping columns;
data s3_fund_fh00_coef_all_av (drop = _TYPE_ _FREQ_);
  set s3_fund_fh00_coef_all_av;
  stat = 'average';
run;

*Comparison of the standard alphas with the Dimson and CDimson alphas;

*Pass-through step - the file is rewritten unchanged;
data v_rets_betas_gt12mm;
  set v_rets_betas_gt12mm;
run;

*Step 1 - time series mean of every alpha variant, fund by fund;
*The output variables keep the names of the analysis variables;
proc sort data = v_rets_betas_gt12mm;
  by aut_rank categ fundid_mer;
run;

proc means data = v_rets_betas_gt12mm noprint;
  by aut_rank categ fundid_mer;
  var r0_fh_a r0_fh_a_d1v2 r0_fh_a_d2v2 r0_fh_a_d3v2 r0_fh_a_cd3v2
      r1_fh_a r1_fh_a_d1v2 r1_fh_a_d2v2 r1_fh_a_d3v2 r1_fh_a_cd3v2
      r3_fh_a r3_fh_a_d1v2 r3_fh_a_d2v2 r3_fh_a_d3v2 r3_fh_a_cd3v2;
  output out = v_fund_fhalpha_avg_dims
         mean = r0_fh_a r0_fh_a_d1v2 r0_fh_a_d2v2 r0_fh_a_d3v2 r0_fh_a_cd3v2
                r1_fh_a r1_fh_a_d1v2 r1_fh_a_d2v2 r1_fh_a_d3v2 r1_fh_a_cd3v2
                r3_fh_a r3_fh_a_d1v2 r3_fh_a_d2v2 r3_fh_a_d3v2 r3_fh_a_cd3v2;
run;

*Step 2 - cross sectional mean of the fund averages within each;
*aut_rank by categ cell, stored under shorter names;
proc sort data = v_fund_fhalpha_avg_dims;
  by aut_rank categ;
run;

proc means data = v_fund_fhalpha_avg_dims noprint;
  by aut_rank categ;
  var r0_fh_a r0_fh_a_d1v2 r0_fh_a_d2v2 r0_fh_a_d3v2 r0_fh_a_cd3v2
      r1_fh_a r1_fh_a_d1v2 r1_fh_a_d2v2 r1_fh_a_d3v2 r1_fh_a_cd3v2
      r3_fh_a r3_fh_a_d1v2 r3_fh_a_d2v2 r3_fh_a_d3v2 r3_fh_a_cd3v2;
  output out = dims_fhalpha_avg
         mean = r0a r0a_dim1 r0a_dim2 r0a_dim3 r0a_cdim3
                r1a r1a_dim1 r1a_dim2 r1a_dim3 r1a_cdim3
                r3a r3a_dim1 r3a_dim2 r3a_dim3 r3a_cdim3;
run;


***************************************************;
**** The table reports estimated alpha minus true alpha;
**** True alpha is the intercept from regressing eret_adj on the eight fh factors, fund by fund;

proc sort data = vread_hf00;
  by categ fundid_mer yyyymm;
run;

proc reg data = vread_hf00 outest = v_eret_alpha noprint tableout;
  by categ fundid_mer;
  model eret_adj = fh1 fh2 fh3 fh4 fh5 fh6 fh7 fh8 / edf adjrsq;
run;
quit;

*TABLEOUT adds extra rows per regression, so only the parameter estimate;
*row is read back. reg_obs is computed but is not part of the kept columns;
data v_eret_alpha (keep = categ fundid_mer eret_alp);
  set v_eret_alpha (where = (_TYPE_ = 'PARMS'));
  eret_alp = intercept;
  reg_obs  = _P_ + _EDF_;
run;

*Join the true alpha to the estimated alphas of each fund and derive the;
*estimation errors;
proc sort data = v_fund_fhalpha_avg_dims;
  by categ fundid_mer;
run;

proc sort data = v_eret_alpha nodupkey;
  by categ fundid_mer;
run;

data v_all_alphas;
  merge v_fund_fhalpha_avg_dims (in = has_est)
        v_eret_alpha;
  by categ fundid_mer;
  *keep only funds that have estimated alphas;
  if has_est;

  *estimated alphas - standard, Dimson 3 lags, CDimson 3 lags, for r0 r1 r3;
  array est_alp{9} r0_fh_a r0_fh_a_d3v2 r0_fh_a_cd3v2
                   r1_fh_a r1_fh_a_d3v2 r1_fh_a_cd3v2
                   r3_fh_a r3_fh_a_d3v2 r3_fh_a_cd3v2;
  *annualized difference to the true alpha;
  array alp_diff{9} r0_stand_diff r0_dim_diff r0_cdim_diff
                    r1_stand_diff r1_dim_diff r1_cdim_diff
                    r3_stand_diff r3_dim_diff r3_cdim_diff;
  *absolute value of the difference;
  array alp_abs{9} r0_stand_absdiff r0_dim_absdiff r0_cdim_absdiff
                   r1_stand_absdiff r1_dim_absdiff r1_cdim_absdiff
                   r3_stand_absdiff r3_dim_absdiff r3_cdim_absdiff;
  *squared difference;
  array alp_sq{9} r0_stand_msqe r0_dim_msqe r0_cdim_msqe
                  r1_stand_msqe r1_dim_msqe r1_cdim_msqe
                  r3_stand_msqe r3_dim_msqe r3_cdim_msqe;

  do _k = 1 to 9;
    alp_diff{_k} = (est_alp{_k} - eret_alp)*12;
    alp_abs{_k}  = abs(alp_diff{_k});
    alp_sq{_k}   = (alp_diff{_k})**2;
  end;

  *dummy sequence number built from the row counter (this is simulated data);
  dum_seq = round((_n_+10)/50,1);
  drop _k;
run;

*Table input - mean of every error measure within each categ;
*The output variables keep the names of the analysis variables;
proc sort data = v_all_alphas;
  by categ;
run;

proc means data = v_all_alphas noprint;
  by categ;
  var r0_stand_diff    r1_stand_diff    r3_stand_diff
      r0_dim_diff      r1_dim_diff      r3_dim_diff
      r0_cdim_diff     r1_cdim_diff     r3_cdim_diff
      r0_stand_absdiff r1_stand_absdiff r3_stand_absdiff
      r0_dim_absdiff   r1_dim_absdiff   r3_dim_absdiff
      r0_cdim_absdiff  r1_cdim_absdiff  r3_cdim_absdiff
      r0_stand_msqe    r1_stand_msqe    r3_stand_msqe
      r0_dim_msqe      r1_dim_msqe      r3_dim_msqe
      r0_cdim_msqe     r1_cdim_msqe     r3_cdim_msqe;
  output out = v_alphas_bycateg
         mean = ;
run;

*Average the categ level results across all categ values (one categ per simulation);
*Each step below yields a single row with columns stand dims cdim;

*Rows 1 to 3 - mean alpha difference for R0, R1, R3;
proc means data = v_alphas_bycateg noprint;
  var r0_stand_diff r0_dim_diff r0_cdim_diff;
  output out = w_alphas_r0diff
         mean(r0_stand_diff r0_dim_diff r0_cdim_diff) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r1_stand_diff r1_dim_diff r1_cdim_diff;
  output out = w_alphas_r1diff
         mean(r1_stand_diff r1_dim_diff r1_cdim_diff) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r3_stand_diff r3_dim_diff r3_cdim_diff;
  output out = w_alphas_r3diff
         mean(r3_stand_diff r3_dim_diff r3_cdim_diff) = stand dims cdim;
run;

*Rows 4 to 6 - mean absolute alpha error (MAE) for R0, R1, R3;
proc means data = v_alphas_bycateg noprint;
  var r0_stand_absdiff r0_dim_absdiff r0_cdim_absdiff;
  output out = w_alphas_r0absdiff
         mean(r0_stand_absdiff r0_dim_absdiff r0_cdim_absdiff) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r1_stand_absdiff r1_dim_absdiff r1_cdim_absdiff;
  output out = w_alphas_r1absdiff
         mean(r1_stand_absdiff r1_dim_absdiff r1_cdim_absdiff) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r3_stand_absdiff r3_dim_absdiff r3_cdim_absdiff;
  output out = w_alphas_r3absdiff
         mean(r3_stand_absdiff r3_dim_absdiff r3_cdim_absdiff) = stand dims cdim;
run;

*Rows 7 to 9 - mean squared alpha error (MSQE) for R0, R1, R3;
proc means data = v_alphas_bycateg noprint;
  var r0_stand_msqe r0_dim_msqe r0_cdim_msqe;
  output out = w_alphas_r0msqe
         mean(r0_stand_msqe r0_dim_msqe r0_cdim_msqe) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r1_stand_msqe r1_dim_msqe r1_cdim_msqe;
  output out = w_alphas_r1msqe
         mean(r1_stand_msqe r1_dim_msqe r1_cdim_msqe) = stand dims cdim;
run;

proc means data = v_alphas_bycateg noprint;
  var r3_stand_msqe r3_dim_msqe r3_cdim_msqe;
  output out = w_alphas_r3msqe
         mean(r3_stand_msqe r3_dim_msqe r3_cdim_msqe) = stand dims cdim;
run;

*Stack the nine single row results into one table and label each row;
*The row position identifies the measure - 1 to 3 differences,;
*4 to 6 absolute differences, 7 to 9 squared errors;
data tab_alphas_diff (drop = _TYPE_ _FREQ_);
  set w_alphas_r0diff    w_alphas_r1diff    w_alphas_r3diff
      w_alphas_r0absdiff w_alphas_r1absdiff w_alphas_r3absdiff
      w_alphas_r0msqe    w_alphas_r1msqe    w_alphas_r3msqe;

  *the longest label has 10 characters;
  length var $ 10;
  select (_n_);
    when (1) var = "diff_R0";
    when (2) var = "diff_R1";
    when (3) var = "diff_R3";
    when (4) var = "absdiff_R0";
    when (5) var = "absdiff_R1";
    when (7) var = "mse100_R0";
    when (8) var = "mse100_R1";
    when (9) var = "mse100_R3";
    otherwise var = "absdiff_R3";
  end;

  *squared errors are multiplied by 100 for reporting purposes;
  if _n_ ge 7 then do;
    stand = stand*100;
    dims  = dims*100;
    cdim  = cdim*100;
  end;
run;

*Share of dum_seq groups in which the R3 standard alpha error exceeds the;
*R0 Dimson (R0D) or R0 CDimson (R0CD) alpha error;

*Mean absolute and mean squared errors within each dum_seq group;
proc sort data = v_all_alphas;
  by dum_seq;
run;

proc means data = v_all_alphas noprint;
  by dum_seq;
  var r3_stand_absdiff r0_dim_absdiff r0_cdim_absdiff
      r3_stand_msqe    r0_dim_msqe    r0_cdim_msqe;
  output out = v_alphas_perc_indic_dumseq
         mean = ;
run;

*A SAS comparison evaluates to 1 when true and 0 when false, so each;
*indicator is 1 exactly when the R3 error is the larger one;
data v_alphas_perc_indic_dumseq;
  set v_alphas_perc_indic_dumseq;
  *MAE of R3 versus R0D and R0CD;
  alpha_mae_r3_gt_r0d_i   = (r3_stand_absdiff gt r0_dim_absdiff);
  alpha_mae_r3_gt_r0cd_i  = (r3_stand_absdiff gt r0_cdim_absdiff);
  *MSQE of R3 versus R0D and R0CD;
  alpha_msqe_r3_gt_r0d_i  = (r3_stand_msqe gt r0_dim_msqe);
  alpha_msqe_r3_gt_r0cd_i = (r3_stand_msqe gt r0_cdim_msqe);
run;

*The mean of each indicator over all groups is the share of groups where it equals 1;
proc means data = v_alphas_perc_indic_dumseq noprint;
  var alpha_mae_r3_gt_r0d_i  alpha_mae_r3_gt_r0cd_i
      alpha_msqe_r3_gt_r0d_i alpha_msqe_r3_gt_r0cd_i;
  output out = alphas_perc_indic_all
         mean = ;
run;

*Append the shares as an extra row below the alpha error table;
data tab_alphas_diff (drop = _TYPE_ _FREQ_);
  set tab_alphas_diff
      alphas_perc_indic_all;
run;

*Autocorrelation table;
*The six inputs are merged without a BY statement, so their observations;
*are paired by position;
data v_autoc_all;
  merge v_aucorr_fund_r0 v_aucorr_fund_r1 v_aucorr_fund_r3
        v_aucorr_aggr_r0 v_aucorr_aggr_r1 v_aucorr_aggr_r3;
run;

*Overall mean of the fund level and aggregate level corr1 variables;
*The output variables keep the names of the analysis variables;
proc means data = v_autoc_all noprint;
  var r0_corr1_fund r1_corr1_fund r3_corr1_fund
      r0_corr1_aggr r1_corr1_aggr r3_corr1_aggr;
  output out = tab_autoc_all
         mean = ;
run;



















